Revert "Feat/speech support (#392)"

This reverts commit c9dcfe287f.
This commit is contained in:
Acbox
2026-04-22 00:10:36 +08:00
committed by GitHub
parent c9dcfe287f
commit 63fe03cfff
70 changed files with 1689 additions and 6609 deletions
+14 -49
View File
@@ -18,7 +18,6 @@
<div class="flex flex-col gap-3 mt-4">
<!-- Type -->
<FormField
v-if="!hideType"
v-slot="{ componentField }"
name="type"
>
@@ -36,12 +35,11 @@
</SelectTrigger>
<SelectContent>
<SelectGroup>
<SelectItem
v-for="opt in typeOptions"
:key="opt.value"
:value="opt.value"
>
{{ opt.label }}
<SelectItem value="chat">
Chat
</SelectItem>
<SelectItem value="embedding">
Embedding
</SelectItem>
</SelectGroup>
</SelectContent>
@@ -183,11 +181,6 @@ import { COMPATIBILITY_OPTIONS } from '@/constants/compatibilities'
import FormDialogShell from '@/components/form-dialog-shell/index.vue'
import { useDialogMutation } from '@/composables/useDialogMutation'
interface ModelTypeOption {
value: string
label: string
}
const selectedCompat = ref<string[]>([])
const { t } = useI18n()
const { run } = useDialogMutation()
@@ -200,30 +193,14 @@ const formSchema = toTypedSchema(z.object({
context_window: z.coerce.number().min(1).optional(),
}))
const props = withDefaults(defineProps<{
id: string
typeOptions?: ModelTypeOption[]
defaultType?: string
hideType?: boolean
invalidateKeys?: string[]
}>(), {
typeOptions: () => [
{ value: 'chat', label: 'Chat' },
{ value: 'embedding', label: 'Embedding' },
],
defaultType: 'chat',
hideType: false,
invalidateKeys: () => ['provider-models'],
})
const form = useForm({
validationSchema: formSchema,
initialValues: {
type: props.defaultType,
type: 'chat',
},
})
const selectedType = computed(() => form.values.type || props.defaultType)
const selectedType = computed(() => form.values.type || 'chat')
const open = inject<Ref<boolean>>('openModel', ref(false))
const title = inject<Ref<'edit' | 'title'>>('openModelTitle', ref('title'))
@@ -260,19 +237,15 @@ function onNameInput(e: Event) {
form.setFieldValue('name', (e.target as HTMLInputElement).value)
}
const queryCache = useQueryCache()
function invalidateModelQueries() {
for (const key of props.invalidateKeys) {
queryCache.invalidateQueries({ key: [key] })
}
}
const { id } = defineProps<{ id: string }>()
const queryCache = useQueryCache()
const { mutateAsync: createModel, isLoading: createLoading } = useMutation({
mutation: async (data: Record<string, unknown>) => {
const { data: result } = await postModels({ body: data as ModelsAddRequest, throwOnError: true })
return result
},
onSettled: invalidateModelQueries,
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
})
const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({
mutation: async ({ id, data }: { id: string; data: Record<string, unknown> }) => {
@@ -283,7 +256,7 @@ const { mutateAsync: updateModel, isLoading: updateLoading } = useMutation({
})
return result
},
onSettled: invalidateModelQueries,
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
})
const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading } = useMutation({
mutation: async ({ modelId, data }: { modelId: string; data: Record<string, unknown> }) => {
@@ -294,7 +267,7 @@ const { mutateAsync: updateModelByLegacyModelID, isLoading: updateLegacyLoading
})
return result
},
onSettled: invalidateModelQueries,
onSettled: () => queryCache.invalidateQueries({ key: ['provider-models'] }),
})
const isLoading = computed(() => createLoading.value || updateLoading.value || updateLegacyLoading.value)
@@ -324,7 +297,7 @@ async function addModel() {
const payload: Record<string, unknown> = {
type,
model_id,
provider_id: props.id,
provider_id: id,
config,
}
@@ -375,15 +348,7 @@ watch(open, async () => {
selectedCompat.value = config?.compatibilities ?? []
userEditedName.value = !!(name && name !== model_id)
} else {
form.resetForm({
values: {
type: props.defaultType,
model_id: '',
name: '',
dimensions: undefined,
context_window: undefined,
},
})
form.resetForm({ values: { type: 'chat', model_id: '', name: '', dimensions: undefined, context_window: undefined } })
selectedCompat.value = []
userEditedName.value = false
}
@@ -52,7 +52,7 @@ import { computed, type Component } from 'vue'
import { storeToRefs } from 'pinia'
import { useRouter, useRoute } from 'vue-router'
import { useI18n } from 'vue-i18n'
import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, AudioLines, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next'
import { ChevronLeft, Bot, Boxes, Globe, Brain, Volume2, Mail, AppWindow, ChartLine, User, Store, Info } from 'lucide-vue-next'
import { useChatSelectionStore } from '@/store/chat-selection'
import {
Sidebar,
@@ -118,11 +118,6 @@ const navItems = computed<{ title: string; name: string; icon: Component }[]>(()
name: 'speech',
icon: Volume2,
},
{
title: t('sidebar.transcription'),
name: 'transcription',
icon: AudioLines,
},
{
title: t('sidebar.email'),
name: 'email',
+1 -31
View File
@@ -45,41 +45,21 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
label: 'OpenAI Speech',
hint: 'OpenAI /audio/speech compatible TTS',
},
'openai-transcription': {
value: 'openai-transcription',
label: 'OpenAI Transcription',
hint: 'OpenAI audio transcription',
},
'openrouter-speech': {
value: 'openrouter-speech',
label: 'OpenRouter Speech',
hint: 'OpenRouter audio modality TTS',
},
'openrouter-transcription': {
value: 'openrouter-transcription',
label: 'OpenRouter Transcription',
hint: 'OpenRouter transcription models',
},
'elevenlabs-speech': {
value: 'elevenlabs-speech',
label: 'ElevenLabs Speech',
hint: 'ElevenLabs text-to-speech',
},
'elevenlabs-transcription': {
value: 'elevenlabs-transcription',
label: 'ElevenLabs Transcription',
hint: 'ElevenLabs speech-to-text',
},
'deepgram-speech': {
value: 'deepgram-speech',
label: 'Deepgram Speech',
hint: 'Deepgram TTS',
},
'deepgram-transcription': {
value: 'deepgram-transcription',
label: 'Deepgram Transcription',
hint: 'Deepgram speech-to-text',
},
'minimax-speech': {
value: 'minimax-speech',
label: 'MiniMax Speech',
@@ -100,19 +80,9 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
label: 'Microsoft Speech',
hint: 'Azure Cognitive Services TTS',
},
'google-speech': {
value: 'google-speech',
label: 'Google Speech',
hint: 'Gemini speech transcription',
},
'google-transcription': {
value: 'google-transcription',
label: 'Google Transcription',
hint: 'Gemini speech transcription',
},
}
export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META)
export const LLM_CLIENT_TYPE_LIST: ClientTypeMeta[] = CLIENT_TYPE_LIST
.filter(ct => !ct.value.endsWith('-speech') && !ct.value.endsWith('-transcription'))
.filter(ct => !ct.value.endsWith('-speech'))
-27
View File
@@ -63,7 +63,6 @@
"webSearch": "Web Search",
"memory": "Memory",
"speech": "Speech",
"transcription": "Transcription",
"email": "Email",
"settings": "Settings",
"profile": "Profile",
@@ -426,9 +425,6 @@
"noModels": "No models found. Click \"Import Models\" to discover available models or \"Add Model\" to create one manually.",
"noCapabilities": "No capabilities available for this model.",
"saveSuccess": "Speech configuration saved",
"synthesis": {
"models": "Synthesis Models"
},
"advanced": {
"title": "Advanced Settings",
"description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
@@ -452,27 +448,6 @@
"failed": "Synthesis failed"
}
},
"transcription": {
"title": "Transcription",
"emptyTitle": "No Transcription Providers",
"emptyDescription": "Add a transcription provider to enable speech-to-text for your bots",
"models": "Transcription Models",
"noModels": "No transcription models found. Import available models or keep the default template model.",
"noCapabilities": "No capabilities available for this model.",
"importModels": "Import Models",
"importSuccess": "Transcription models imported successfully",
"importFailed": "Failed to import transcription models",
"saveSuccess": "Transcription configuration saved",
"advanced": {
"title": "Advanced Settings",
"description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
},
"test": {
"title": "Test Transcription",
"run": "Transcribe",
"failed": "Transcription failed"
}
},
"email": {
"title": "Email",
"add": "Add Email",
@@ -945,8 +920,6 @@
"memoryHealthUnavailable": "Unavailable",
"ttsModel": "TTS Model",
"ttsModelPlaceholder": "Select TTS model",
"transcriptionModel": "Transcription Model",
"transcriptionModelPlaceholder": "Select transcription model",
"imageModel": "Image Generation Model",
"imageModelDescription": "Model used for the generate_image tool. Must support image-output compatibility.",
"imageModelPlaceholder": "Select image model (optional)",
-27
View File
@@ -64,7 +64,6 @@
"webSearch": "搜索",
"memory": "记忆",
"speech": "语音",
"transcription": "转写",
"email": "邮件",
"profile": "用户",
"home": "首页",
@@ -422,9 +421,6 @@
"noModels": "暂无模型,点击\"导入模型\"发现可用模型,或点击\"新建模型\"手动创建。",
"noCapabilities": "该模型暂无可用能力信息。",
"saveSuccess": "语音配置已保存",
"synthesis": {
"models": "语音合成模型"
},
"advanced": {
"title": "高级设置",
"description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
@@ -448,27 +444,6 @@
"failed": "合成失败"
}
},
"transcription": {
"title": "语音转写",
"emptyTitle": "暂无转写提供方",
"emptyDescription": "添加转写提供方以为 Bot 启用语音转文字功能",
"models": "语音识别模型",
"noModels": "暂无语音识别模型,可导入可用模型,或保留默认模板模型。",
"importModels": "导入模型",
"importSuccess": "识别模型导入成功",
"importFailed": "识别模型导入失败",
"saveSuccess": "转写配置已保存",
"noCapabilities": "该模型暂无可用能力信息。",
"advanced": {
"title": "高级设置",
"description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
},
"test": {
"title": "测试识别",
"run": "开始识别",
"failed": "识别失败"
}
},
"email": {
"title": "邮件提供方",
"add": "添加邮件提供方",
@@ -941,8 +916,6 @@
"memoryHealthUnavailable": "暂不可用",
"ttsModel": "语音合成模型",
"ttsModelPlaceholder": "选择语音合成模型",
"transcriptionModel": "转写模型",
"transcriptionModelPlaceholder": "选择语音转写模型",
"imageModel": "图片生成模型",
"imageModelDescription": "用于 generate_image 工具的模型,必须支持 image-output 兼容性。",
"imageModelPlaceholder": "选择图片模型(可选)",
@@ -187,17 +187,6 @@
/>
</div>
<!-- Transcription Model -->
<div class="space-y-2">
<Label>{{ $t('bots.settings.transcriptionModel') }}</Label>
<TtsModelSelect
v-model="form.transcription_model_id"
:models="transcriptionModels"
:providers="ttsProviders"
:placeholder="$t('bots.settings.transcriptionModelPlaceholder')"
/>
</div>
<!-- Image Generation Model -->
<div class="space-y-2">
<Label>{{ $t('bots.settings.imageModel') }}</Label>
@@ -367,7 +356,7 @@ import MemoryProviderSelect from './memory-provider-select.vue'
import TtsModelSelect from './tts-model-select.vue'
import BrowserContextSelect from './browser-context-select.vue'
import { useQuery, useMutation, useQueryCache } from '@pinia/colada'
import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getTranscriptionProviders, getTranscriptionModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk'
import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk'
import type { SettingsSettings } from '@memohai/sdk'
import type { Ref } from 'vue'
import { resolveApiErrorMessage } from '@/utils/api-error'
@@ -451,22 +440,6 @@ const { data: ttsModelData } = useQuery({
},
})
const { data: transcriptionModelData } = useQuery({
key: ['transcription-models'],
query: async () => {
const { data } = await getTranscriptionModels({ throwOnError: true })
return data
},
})
const { data: transcriptionProviderData } = useQuery({
key: ['transcription-providers'],
query: async () => {
const { data } = await getTranscriptionProviders({ throwOnError: true })
return data
},
})
const { data: browserContextData } = useQuery({
key: ['all-browser-contexts'],
query: async () => {
@@ -521,10 +494,7 @@ const searchProviders = computed(() => (searchProviderData.value ?? []).filter((
const memoryProviders = computed(() => memoryProviderData.value ?? [])
const ttsProviders = computed(() => (ttsProviderData.value ?? []).filter((p) => p.enable !== false))
const enabledTtsProviderIds = computed(() => new Set(ttsProviders.value.map((p) => p.id)))
const transcriptionProviders = computed(() => (transcriptionProviderData.value ?? []).filter((p: Record<string, unknown>) => p.enable !== false))
const enabledTranscriptionProviderIds = computed(() => new Set(transcriptionProviders.value.map((p: Record<string, unknown>) => p.id as string)))
const ttsModels = computed(() => (ttsModelData.value ?? []).filter((m: Record<string, unknown>) => enabledTtsProviderIds.value.has(m.provider_id as string)))
const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record<string, unknown>) => enabledTranscriptionProviderIds.value.has(m.provider_id as string)))
const browserContexts = computed(() => browserContextData.value ?? [])
// ---- Form ----
@@ -535,7 +505,6 @@ const form = reactive({
search_provider_id: '',
memory_provider_id: '',
tts_model_id: '',
transcription_model_id: '',
browser_context_id: '',
timezone: '',
language: '',
@@ -675,7 +644,6 @@ watch(settings, (val) => {
form.search_provider_id = val.search_provider_id ?? ''
form.memory_provider_id = val.memory_provider_id ?? ''
form.tts_model_id = val.tts_model_id ?? ''
form.transcription_model_id = val.transcription_model_id ?? ''
form.browser_context_id = val.browser_context_id ?? ''
form.language = val.language ?? ''
form.timezone = val.timezone ?? ''
@@ -698,7 +666,6 @@ const hasSettingsChanges = computed(() => {
|| form.search_provider_id !== (s.search_provider_id ?? '')
|| form.memory_provider_id !== (s.memory_provider_id ?? '')
|| form.tts_model_id !== (s.tts_model_id ?? '')
|| form.transcription_model_id !== (s.transcription_model_id ?? '')
|| form.browser_context_id !== (s.browser_context_id ?? '')
|| form.language !== (s.language ?? '')
|| form.timezone !== (s.timezone ?? '')
@@ -85,7 +85,7 @@
v-else-if="advancedFields.length === 0"
class="text-xs text-muted-foreground"
>
{{ mode === 'transcription' ? $t('transcription.noCapabilities') : $t('speech.noCapabilities') }}
{{ $t('speech.noCapabilities') }}
</div>
<div
@@ -97,7 +97,7 @@
class="flex w-full items-center justify-between px-3 py-2 text-left text-xs font-medium"
@click="showAdvanced = !showAdvanced"
>
<span>{{ mode === 'transcription' ? $t('transcription.advanced.title') : $t('speech.advanced.title') }}</span>
<span>{{ $t('speech.advanced.title') }}</span>
<component
:is="showAdvanced ? ChevronUp : ChevronDown"
class="size-3 text-muted-foreground"
@@ -108,7 +108,7 @@
class="space-y-4 border-t border-border px-3 py-3"
>
<p class="text-xs text-muted-foreground">
{{ mode === 'transcription' ? $t('transcription.advanced.description') : $t('speech.advanced.description') }}
{{ $t('speech.advanced.description') }}
</p>
<section
v-for="field in advancedFields"
@@ -195,12 +195,9 @@
<div class="space-y-3">
<h4 class="text-xs font-medium">
{{ mode === 'transcription' ? $t('transcription.test.title') : $t('speech.test.title') }}
{{ $t('speech.test.title') }}
</h4>
<div
v-if="mode === 'synthesis'"
class="relative"
>
<div class="relative">
<Textarea
v-model="testText"
:placeholder="$t('speech.test.placeholder')"
@@ -212,36 +209,17 @@
{{ testText.length }}/{{ maxTestTextLen }}
</span>
</div>
<div
v-else
class="space-y-2"
>
<Input
type="file"
accept="audio/*"
@change="handleFileChange"
/>
<p
v-if="selectedFileName"
class="text-xs text-muted-foreground"
>
{{ selectedFileName }}
</p>
</div>
<div class="flex items-center gap-3">
<LoadingButton
type="button"
variant="outline"
size="sm"
:loading="testLoading"
:disabled="mode === 'synthesis' ? (!testText.trim() || testText.length > maxTestTextLen) : !selectedFile"
:disabled="!testText.trim() || testText.length > maxTestTextLen"
@click="handleTest"
>
<Play
v-if="mode === 'synthesis'"
class="mr-1.5"
/>
{{ mode === 'transcription' ? $t('transcription.test.run') : $t('speech.test.generate') }}
<Play class="mr-1.5" />
{{ $t('speech.test.generate') }}
</LoadingButton>
<span
v-if="testError"
@@ -251,7 +229,7 @@
</span>
</div>
<div
v-if="mode === 'synthesis' && audioUrl"
v-if="audioUrl"
class="rounded-md border border-border bg-muted/30 p-3"
>
<audio
@@ -261,20 +239,6 @@
class="w-full"
/>
</div>
<div
v-if="mode === 'transcription' && transcriptionText"
class="rounded-md border border-border bg-muted/30 p-3 space-y-2"
>
<p class="text-sm whitespace-pre-wrap wrap-break-word">
{{ transcriptionText }}
</p>
<p
v-if="transcriptionLanguage"
class="text-xs text-muted-foreground"
>
{{ transcriptionLanguage }}
</p>
</div>
</div>
<Separator class="my-3" />
@@ -332,8 +296,7 @@ const props = defineProps<{
modelName: string
config: Record<string, unknown>
schema: SpeechConfigSchema | null
mode?: 'synthesis' | 'transcription'
onTest: (payload: string | File, config: Record<string, unknown>) => Promise<Blob | { text?: string, language?: string }>
onTest: (text: string, config: Record<string, unknown>) => Promise<Blob>
}>()
const emit = defineEmits<{
@@ -346,16 +309,11 @@ const visibleSecrets = reactive<Record<string, boolean>>({})
const saving = ref(false)
const showAdvanced = ref(false)
const testText = ref('')
const selectedFile = ref<File | null>(null)
const selectedFileName = ref('')
const testLoading = ref(false)
const testError = ref('')
const audioUrl = ref('')
const transcriptionText = ref('')
const transcriptionLanguage = ref('')
const audioEl = ref<HTMLAudioElement>()
const maxTestTextLen = 500
const mode = computed(() => props.mode ?? 'synthesis')
const orderedFields = computed(() => {
const fields = props.schema?.fields ?? []
@@ -390,11 +348,6 @@ function revokeAudio() {
}
}
function resetTranscription() {
transcriptionText.value = ''
transcriptionLanguage.value = ''
}
onBeforeUnmount(revokeAudio)
async function handleSaveConfig() {
@@ -407,39 +360,23 @@ async function handleSaveConfig() {
}
async function handleTest() {
if (mode.value === 'synthesis' && !testText.value.trim()) return
if (mode.value === 'transcription' && !selectedFile.value) return
if (!testText.value.trim()) return
testLoading.value = true
testError.value = ''
revokeAudio()
resetTranscription()
try {
const result = await props.onTest(mode.value === 'synthesis' ? testText.value : selectedFile.value as File, buildConfig())
const blob = await props.onTest(testText.value, buildConfig())
if (mode.value === 'synthesis') {
const blob = result as Blob
audioUrl.value = URL.createObjectURL(blob)
await new Promise<void>((resolve) => setTimeout(resolve, 50))
audioEl.value?.play()
} else {
const payload = result as { text?: string, language?: string }
transcriptionText.value = payload.text ?? ''
transcriptionLanguage.value = payload.language ?? ''
}
audioUrl.value = URL.createObjectURL(blob)
await new Promise<void>((resolve) => setTimeout(resolve, 50))
audioEl.value?.play()
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : t(mode.value === 'transcription' ? 'transcription.test.failed' : 'speech.test.failed')
const msg = error instanceof Error ? error.message : t('speech.test.failed')
testError.value = msg
toast.error(msg)
} finally {
testLoading.value = false
}
}
function handleFileChange(event: Event) {
const input = event.target as HTMLInputElement
const file = input.files?.[0] ?? null
selectedFile.value = file
selectedFileName.value = file?.name ?? ''
}
</script>
@@ -138,29 +138,18 @@
<section>
<div class="flex justify-between items-center mb-4">
<h3 class="text-xs font-medium">
{{ $t('speech.synthesis.models') }}
{{ $t('speech.models') }}
</h3>
<div
<LoadingButton
v-if="curProviderId"
class="flex items-center gap-2"
type="button"
variant="outline"
size="sm"
:loading="importLoading"
@click="handleImportModels"
>
<LoadingButton
type="button"
variant="outline"
size="sm"
:loading="importLoading"
@click="handleImportModels"
>
{{ $t('speech.importModels') }}
</LoadingButton>
<CreateModel
:id="curProviderId"
default-type="speech"
hide-type
:type-options="speechTypeOptions"
:invalidate-keys="['speech-provider-models', 'speech-models']"
/>
</div>
{{ $t('speech.importModels') }}
</LoadingButton>
</div>
<div
@@ -202,7 +191,7 @@
:model-name="model.model_id ?? ''"
:config="model.config || {}"
:schema="getModelSchema(model.model_id ?? '')"
:on-test="(text, cfg) => handleTestModel(model.id ?? '', text as string, cfg)"
:on-test="(text, cfg) => handleTestModel(model.id ?? '', text, cfg)"
@save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
/>
</div>
@@ -229,11 +218,10 @@ import { computed, inject, reactive, ref, watch } from 'vue'
import { toast } from 'vue-sonner'
import { useI18n } from 'vue-i18n'
import { useQuery, useQueryCache } from '@pinia/colada'
import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putProvidersById } from '@memohai/sdk'
import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putModelsById, putProvidersById } from '@memohai/sdk'
import type { TtsSpeechModelResponse, TtsSpeechProviderResponse } from '@memohai/sdk'
import LoadingButton from '@/components/loading-button/index.vue'
import ProviderIcon from '@/components/provider-icon/index.vue'
import CreateModel from '@/components/create-model/index.vue'
interface SpeechFieldSchema {
key: string
@@ -268,8 +256,6 @@ interface SpeechProviderMeta {
config_schema?: SpeechConfigSchema
default_model?: string
models?: SpeechModelMeta[]
default_synthesis_model?: string
synthesis_models?: SpeechModelMeta[]
}
function getInitials(name: string | undefined) {
@@ -288,9 +274,6 @@ const enableLoading = ref(false)
const saveLoading = ref(false)
const importLoading = ref(false)
const queryCache = useQueryCache()
const speechTypeOptions = [
{ value: 'speech', label: 'Speech' },
]
const { data: providerDetail } = useQuery({
key: () => ['speech-provider-detail', curProviderId.value],
@@ -314,7 +297,7 @@ const { data: metaList } = useQuery({
const currentMeta = computed(() => {
if (!metaList.value || !curProvider.value?.client_type) return null
return (metaList.value as SpeechProviderMeta[]).find(m => m.provider === curProvider.value?.client_type) ?? null
return (metaList.value as SpeechProviderMeta[]).find((m) => m.provider === curProvider.value?.client_type) ?? null
})
const orderedProviderFields = computed(() => {
@@ -334,7 +317,9 @@ const { data: providerSpeechModels } = useQuery({
},
})
const providerModels = computed(() => ((providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []))
const providerModels = computed(() => {
return (providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []
})
watch(() => providerDetail.value, (provider) => {
providerName.value = provider?.name ?? curProvider.value?.name ?? ''
@@ -343,11 +328,12 @@ watch(() => providerDetail.value, (provider) => {
}, { immediate: true, deep: true })
function getModelMeta(modelID: string): SpeechModelMeta | null {
const models = currentMeta.value?.synthesis_models ?? currentMeta.value?.models ?? []
const models = currentMeta.value?.models ?? []
const exact = models.find(m => m.id === modelID)
if (exact) return exact
const defaultModel = currentMeta.value?.default_synthesis_model ?? currentMeta.value?.default_model
if (defaultModel) return models.find(m => m.id === defaultModel) ?? null
if (currentMeta.value?.default_model) {
return models.find(m => m.id === currentMeta.value?.default_model) ?? null
}
return models[0] ?? null
}
@@ -412,23 +398,20 @@ async function handleSaveProvider() {
}
async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
const model = providerModels.value.find(item => item.id === modelId)
const model = providerModels.value.find((item) => item.id === modelId)
if (!model) return
try {
const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
const token = localStorage.getItem('token')
const resp = await fetch(`${apiBase}/speech-models/${modelId}`, {
method: 'PUT',
headers: {
'Content-Type': 'application/json',
...(token ? { Authorization: `Bearer ${token}` } : {}),
},
body: JSON.stringify({
await putModelsById({
path: { id: modelId },
body: {
model_id: model.model_id,
name: model.name ?? model.model_id,
provider_id: model.provider_id,
type: 'speech',
config,
}),
},
throwOnError: true,
})
if (!resp.ok) throw new Error(await resp.text())
toast.success(t('speech.saveSuccess'))
queryCache.invalidateQueries({ key: ['speech-provider-models', curProviderId.value] })
queryCache.invalidateQueries({ key: ['speech-models'] })
-126
View File
@@ -1,126 +0,0 @@
<script setup lang="ts">
import { computed, ref, provide, watch } from 'vue'
import { useQuery } from '@pinia/colada'
import {
ScrollArea,
SidebarMenu,
SidebarMenuButton,
SidebarMenuItem,
Toggle,
Empty,
EmptyDescription,
EmptyHeader,
EmptyMedia,
EmptyTitle,
} from '@memohai/ui'
import { getTranscriptionProviders } from '@memohai/sdk'
import type { AudioSpeechProviderResponse } from '@memohai/sdk'
import ProviderSetting from './provider-setting.vue'
import { AudioLines } from 'lucide-vue-next'
import MasterDetailSidebarLayout from '@/components/master-detail-sidebar-layout/index.vue'
import ProviderIcon from '@/components/provider-icon/index.vue'
function getInitials(name: string | undefined) {
const label = name?.trim() ?? ''
return label ? label.slice(0, 2).toUpperCase() : '?'
}
const { data: providerData } = useQuery({
key: () => ['transcription-providers'],
query: async () => {
const { data } = await getTranscriptionProviders({ throwOnError: true })
return (data ?? []) as AudioSpeechProviderResponse[]
},
})
const curProvider = ref<AudioSpeechProviderResponse>()
provide('curTranscriptionProvider', curProvider)
const selectProvider = (name: string) => computed(() => curProvider.value?.name === name)
const filteredProviders = computed(() => {
if (!Array.isArray(providerData.value)) return []
return [...providerData.value].sort((a, b) => Number(b.enable !== false) - Number(a.enable !== false))
})
watch(filteredProviders, (list) => {
if (!list || list.length === 0) {
curProvider.value = { id: '' }
return
}
const currentId = curProvider.value?.id
if (currentId) {
const stillExists = list.find(p => p.id === currentId)
if (stillExists) {
curProvider.value = stillExists
return
}
}
curProvider.value = list[0]
}, { immediate: true })
</script>
<template>
<MasterDetailSidebarLayout>
<template #sidebar-content>
<SidebarMenu
v-for="item in filteredProviders"
:key="item.id"
>
<SidebarMenuItem>
<SidebarMenuButton
as-child
class="justify-start py-5! px-4"
>
<Toggle
:class="['py-4 border', curProvider?.id === item.id ? 'border-border' : 'border-transparent']"
:model-value="selectProvider(item.name ?? '').value"
@update:model-value="(isSelect) => { if (isSelect) curProvider = item }"
>
<span class="relative shrink-0">
<span class="flex size-7 items-center justify-center rounded-full bg-muted">
<ProviderIcon
v-if="item.icon"
:icon="item.icon"
size="1.25em"
/>
<span
v-else
class="text-xs font-medium text-muted-foreground"
>
{{ getInitials(item.name) }}
</span>
</span>
<span
v-if="item.enable !== false"
class="absolute -bottom-0.5 -right-0.5 size-2.5 rounded-full bg-green-500 ring-2 ring-background"
/>
</span>
<span class="truncate">{{ item.name }}</span>
</Toggle>
</SidebarMenuButton>
</SidebarMenuItem>
</SidebarMenu>
</template>
<template #detail>
<ScrollArea
v-if="curProvider?.id"
class="max-h-full h-full"
>
<ProviderSetting />
</ScrollArea>
<Empty
v-else
class="h-full flex justify-center items-center"
>
<EmptyHeader>
<EmptyMedia variant="icon">
<AudioLines />
</EmptyMedia>
</EmptyHeader>
<EmptyTitle>{{ $t('transcription.emptyTitle') }}</EmptyTitle>
<EmptyDescription>{{ $t('transcription.emptyDescription') }}</EmptyDescription>
</Empty>
</template>
</MasterDetailSidebarLayout>
</template>
@@ -1,480 +0,0 @@
<template>
<div class="p-4">
<section class="flex items-center gap-3">
<span class="flex size-10 shrink-0 items-center justify-center rounded-full bg-muted">
<ProviderIcon
v-if="curProvider?.icon"
:icon="curProvider.icon"
size="1.5em"
/>
<span
v-else
class="text-xs font-medium text-muted-foreground"
>
{{ getInitials(curProvider?.name) }}
</span>
</span>
<div class="min-w-0">
<h2 class="text-sm font-semibold truncate">
{{ curProvider?.name }}
</h2>
<p class="text-xs text-muted-foreground">
{{ currentMeta?.display_name ?? curProvider?.client_type }}
</p>
</div>
<div class="ml-auto flex items-center gap-2">
<span class="text-xs text-muted-foreground">
{{ $t('common.enable') }}
</span>
<Switch
:model-value="curProvider?.enable ?? false"
:disabled="!curProvider?.id || enableLoading"
@update:model-value="handleToggleEnable"
/>
</div>
</section>
<Separator class="mt-4 mb-6" />
<form
class="space-y-4"
@submit.prevent="handleSaveProvider"
>
<section class="space-y-2">
<Label for="transcription-provider-name">{{ $t('common.name') }}</Label>
<Input
id="transcription-provider-name"
v-model="providerName"
type="text"
:placeholder="$t('common.namePlaceholder')"
/>
</section>
<section
v-for="field in orderedProviderFields"
:key="field.key"
class="space-y-2"
>
<Label :for="field.type === 'bool' || field.type === 'enum' ? undefined : `transcription-provider-${field.key}`">
{{ field.title || field.key }}
</Label>
<p
v-if="field.description"
class="text-xs text-muted-foreground"
>
{{ field.description }}
</p>
<div
v-if="field.type === 'secret'"
class="relative"
>
<Input
:id="`transcription-provider-${field.key}`"
v-model="providerConfig[field.key] as string"
:type="visibleSecrets[field.key] ? 'text' : 'password'"
/>
<button
type="button"
class="absolute right-2 top-1/2 -translate-y-1/2 text-muted-foreground hover:text-foreground"
@click="visibleSecrets[field.key] = !visibleSecrets[field.key]"
>
<component
:is="visibleSecrets[field.key] ? EyeOff : Eye"
class="size-3.5"
/>
</button>
</div>
<Switch
v-else-if="field.type === 'bool'"
:model-value="!!providerConfig[field.key]"
@update:model-value="(val) => providerConfig[field.key] = !!val"
/>
<Input
v-else-if="field.type === 'number'"
:id="`transcription-provider-${field.key}`"
v-model.number="providerConfig[field.key] as number"
type="number"
/>
<Select
v-else-if="field.type === 'enum' && field.enum"
:model-value="String(providerConfig[field.key] ?? '')"
@update:model-value="(val) => providerConfig[field.key] = val"
>
<SelectTrigger>
<SelectValue :placeholder="field.title || field.key" />
</SelectTrigger>
<SelectContent>
<SelectItem
v-for="opt in field.enum"
:key="opt"
:value="opt"
>
{{ opt }}
</SelectItem>
</SelectContent>
</Select>
<Input
v-else
:id="`transcription-provider-${field.key}`"
v-model="providerConfig[field.key] as string"
type="text"
/>
</section>
<div class="flex justify-end">
<LoadingButton
type="submit"
:loading="saveLoading"
>
{{ $t('provider.saveChanges') }}
</LoadingButton>
</div>
</form>
<Separator class="mt-6 mb-6" />
<section>
<div class="flex justify-between items-center mb-4">
<h3 class="text-xs font-medium">
{{ $t('transcription.models') }}
</h3>
<div
v-if="curProviderId"
class="flex items-center gap-2"
>
<LoadingButton
type="button"
variant="outline"
size="sm"
:loading="importLoading"
@click="handleImportModels"
>
{{ $t('transcription.importModels') }}
</LoadingButton>
<CreateModel
:id="curProviderId"
default-type="transcription"
hide-type
:type-options="transcriptionTypeOptions"
:invalidate-keys="['transcription-provider-models', 'transcription-models']"
/>
</div>
</div>
<div
v-if="providerModels.length === 0"
class="text-xs text-muted-foreground py-4 text-center"
>
{{ $t('transcription.noModels') }}
</div>
<div
v-for="model in providerModels"
:key="model.id"
class="border border-border rounded-lg mb-4"
>
<button
type="button"
class="w-full flex items-center justify-between p-3 text-left hover:bg-accent/50 rounded-t-lg transition-colors"
@click="toggleModel(model.id ?? '')"
>
<div>
<span class="text-xs font-medium">{{ model.name || model.model_id }}</span>
<span
v-if="model.name"
class="text-xs text-muted-foreground ml-2"
>
{{ model.model_id }}
</span>
</div>
<component
:is="expandedModelId === model.id ? ChevronUp : ChevronDown"
class="size-3 text-muted-foreground"
/>
</button>
<div
v-if="expandedModelId === model.id"
class="px-3 pb-3 space-y-4 border-t border-border pt-3"
>
<ModelConfigEditor
:model-id="model.id ?? ''"
:model-name="model.model_id ?? ''"
:config="model.config || {}"
:schema="getModelSchema(model.model_id ?? '')"
mode="transcription"
:on-test="(file, cfg) => handleTestModel(model.id ?? '', file as File, cfg)"
@save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
/>
</div>
</div>
</section>
</div>
</template>
<script setup lang="ts">
import { computed, inject, reactive, ref, watch } from 'vue'
import { useQuery, useQueryCache } from '@pinia/colada'
import { toast } from 'vue-sonner'
import { useI18n } from 'vue-i18n'
import {
getTranscriptionProvidersById,
getTranscriptionProvidersMeta,
getTranscriptionProvidersByIdModels,
postTranscriptionProvidersByIdImportModels,
postTranscriptionModelsByIdTest,
putProvidersById,
putTranscriptionModelsById,
} from '@memohai/sdk'
import type {
AudioProviderMetaResponse,
AudioSpeechProviderResponse,
AudioTestTranscriptionResponse,
AudioTranscriptionModelResponse,
} from '@memohai/sdk'
import { ChevronDown, ChevronUp, Eye, EyeOff } from 'lucide-vue-next'
import { Input, Label, Select, SelectContent, SelectItem, SelectTrigger, SelectValue, Separator, Switch } from '@memohai/ui'
import ProviderIcon from '@/components/provider-icon/index.vue'
import LoadingButton from '@/components/loading-button/index.vue'
import ModelConfigEditor from '@/pages/speech/components/model-config-editor.vue'
import CreateModel from '@/components/create-model/index.vue'
// One field of a dynamic config form: storage key, widget type, and optional display metadata.
interface FieldSchema { key: string, type: string, title?: string, description?: string, enum?: string[], order?: number }
// A config form definition: an (optionally ordered) list of fields.
interface ConfigSchema { fields?: FieldSchema[] }
// Normalized model metadata; the config schema may sit at top level or under capabilities.
interface ModelMeta { id: string, name: string, config_schema?: ConfigSchema, capabilities?: { config_schema?: ConfigSchema } }
// Normalized provider metadata as consumed by this page (see normalizeProviderMeta).
interface ProviderMeta {
  provider: string
  display_name?: string
  config_schema?: ConfigSchema
  default_transcription_model?: string
  transcription_models?: ModelMeta[]
  models?: ModelMeta[]
}
// Build a short avatar placeholder: the first two characters of the trimmed
// name, uppercased; '?' when the name is missing or blank.
function getInitials(name: string | undefined) {
  const trimmed = (name ?? '').trim()
  if (trimmed.length === 0) return '?'
  return trimmed.substring(0, 2).toUpperCase()
}
// Convert an SDK config schema into the local ConfigSchema shape,
// dropping entries that lack a key or a type. Returns undefined when
// the provider supplies no schema at all.
function normalizeConfigSchema(schema?: AudioProviderMetaResponse['config_schema']): ConfigSchema | undefined {
  if (!schema) return undefined
  const fields = (schema.fields ?? []).reduce<FieldSchema[]>((acc, field) => {
    if (field?.key && field.type) {
      acc.push({
        key: field.key,
        type: field.type,
        title: field.title,
        description: field.description,
        enum: field.enum,
        order: field.order,
      })
    }
    return acc
  }, [])
  return { fields }
}
// Normalize one SDK model entry into ModelMeta; entries without an id are
// discarded (null) so callers can filter them out.
function normalizeModelMeta(model: NonNullable<AudioProviderMetaResponse['models']>[number]): ModelMeta | null {
  if (!model?.id) return null
  const capabilities = model.capabilities
    ? { config_schema: normalizeConfigSchema(model.capabilities.config_schema) }
    : undefined
  return {
    id: model.id,
    // Fall back to the id when the SDK provides no display name.
    name: model.name ?? model.id,
    config_schema: normalizeConfigSchema(model.config_schema),
    capabilities,
  }
}
// Normalize SDK provider metadata into the local ProviderMeta shape,
// dropping malformed (id-less) model entries from both catalogs.
function normalizeProviderMeta(meta: AudioProviderMetaResponse): ProviderMeta {
  const keep = (model: ModelMeta | null): model is ModelMeta => model !== null
  return {
    provider: meta.provider ?? '',
    display_name: meta.display_name,
    config_schema: normalizeConfigSchema(meta.config_schema),
    default_transcription_model: meta.default_transcription_model,
    transcription_models: (meta.transcription_models ?? []).map(normalizeModelMeta).filter(keep),
    models: (meta.models ?? []).map(normalizeModelMeta).filter(keep),
  }
}
const { t } = useI18n()
// Selected provider, injected by the parent page; undefined until a row is picked.
const curProvider = inject('curTranscriptionProvider', ref<AudioSpeechProviderResponse>())
const curProviderId = computed(() => curProvider.value?.id)
// Local editable copies of the provider form state (name + dynamic config fields).
const providerName = ref('')
const providerConfig = reactive<Record<string, unknown>>({})
// Which secret fields are currently shown in clear text, keyed by field key.
const visibleSecrets = reactive<Record<string, boolean>>({})
// Id of the model card currently expanded in the accordion ('' = all collapsed).
const expandedModelId = ref('')
const enableLoading = ref(false)
const saveLoading = ref(false)
const importLoading = ref(false)
const queryCache = useQueryCache()
// CreateModel is pinned to the single 'transcription' type on this page.
const transcriptionTypeOptions = [
  { value: 'transcription', label: 'Transcription' },
]
// Full provider detail (name + saved config) for the selected provider.
const { data: providerDetail } = useQuery({
  key: () => ['transcription-provider-detail', curProviderId.value ?? ''],
  query: async () => {
    if (!curProviderId.value) return null
    const { data } = await getTranscriptionProvidersById({
      path: { id: curProviderId.value },
      throwOnError: true,
    })
    return (data ?? null) as AudioSpeechProviderResponse | null
  },
})
// Static metadata (config schemas, model catalogs) for all transcription providers.
const { data: metaList } = useQuery({
  key: () => ['transcription-providers-meta'],
  query: async () => {
    const { data } = await getTranscriptionProvidersMeta({ throwOnError: true })
    return (data ?? []).map(normalizeProviderMeta)
  },
})
// Metadata entry matching the selected provider's client_type, if any.
const currentMeta = computed(() => (metaList.value ?? []).find(m => m.provider === curProvider.value?.client_type) ?? null)
// Provider config fields sorted by declared order; a missing order sorts as 0.
const orderedProviderFields = computed(() => [...(currentMeta.value?.config_schema?.fields ?? [])].sort((a, b) => (a.order ?? 0) - (b.order ?? 0)))
// Models registered under the selected provider.
const { data: providerModelData } = useQuery({
  key: () => ['transcription-provider-models', curProviderId.value ?? ''],
  query: async () => {
    if (!curProviderId.value) return []
    const { data } = await getTranscriptionProvidersByIdModels({
      path: { id: curProviderId.value },
      throwOnError: true,
    })
    return (data ?? []) as AudioTranscriptionModelResponse[]
  },
})
const providerModels = computed(() => providerModelData.value ?? [])
// Re-seed the local form whenever the fetched detail changes: reset the name,
// then replace providerConfig's keys in place (delete-then-assign keeps the
// reactive proxy intact instead of swapping the object).
watch(() => providerDetail.value, (provider) => {
  providerName.value = provider?.name ?? curProvider.value?.name ?? ''
  Object.keys(providerConfig).forEach((key) => delete providerConfig[key])
  Object.assign(providerConfig, { ...(provider?.config ?? {}) })
}, { immediate: true, deep: true })
// Resolve the config schema to show for a model id from provider metadata.
// Prefers the transcription model catalog; falls back to the generic model
// catalog when the transcription list is missing OR empty. (The original
// `transcription_models ?? models` never reached the fallback, because
// normalizeProviderMeta always produces an array — an empty one is truthy.)
// Lookup order: exact id match → provider default transcription model → first entry.
function getModelSchema(modelID: string): ConfigSchema | null {
  const transcription = currentMeta.value?.transcription_models ?? []
  const models = transcription.length > 0 ? transcription : (currentMeta.value?.models ?? [])
  const exact = models.find(m => m.id === modelID)
  const fallback = exact ?? models.find(m => m.id === currentMeta.value?.default_transcription_model) ?? models[0]
  return fallback?.config_schema ?? fallback?.capabilities?.config_schema ?? null
}
// Expand the clicked model card, or collapse it when it is already open.
function toggleModel(id: string) {
  const alreadyOpen = expandedModelId.value === id
  expandedModelId.value = alreadyOpen ? '' : id
}
// Toggle the provider's enabled flag with an optimistic UI update:
// flip the local state first, persist via the API, roll back on failure.
async function handleToggleEnable(value: boolean) {
  if (!curProviderId.value || !curProvider.value?.client_type) return
  // Remember the previous state so the catch branch can restore it.
  const prev = curProvider.value.enable ?? false
  curProvider.value = { ...curProvider.value, enable: value }
  enableLoading.value = true
  try {
    await putProvidersById({
      path: { id: curProviderId.value },
      body: {
        name: providerName.value.trim() || curProvider.value.name || '',
        client_type: curProvider.value.client_type,
        enable: value,
        // Strip empty/nullish config values before sending.
        config: sanitizeConfig(providerConfig),
      },
      throwOnError: true,
    })
    // Refresh the provider list and the detail query for this provider.
    queryCache.invalidateQueries({ key: ['transcription-providers'] })
    queryCache.invalidateQueries({ key: ['transcription-provider-detail', curProviderId.value ?? ''] })
  } catch {
    // Roll back the optimistic flip and surface the failure.
    curProvider.value = { ...curProvider.value, enable: prev }
    toast.error(t('common.saveFailed'))
  } finally {
    enableLoading.value = false
  }
}
// Persist the provider name + config form, then refresh the provider queries.
async function handleSaveProvider() {
  const id = curProviderId.value
  const clientType = curProvider.value?.client_type
  if (!id || !clientType) return
  saveLoading.value = true
  try {
    const body = {
      name: providerName.value.trim() || curProvider.value?.name || '',
      client_type: clientType,
      enable: curProvider.value?.enable,
      // Empty/nullish config values are stripped before sending.
      config: sanitizeConfig(providerConfig),
    }
    await putProvidersById({ path: { id }, body, throwOnError: true })
    toast.success(t('transcription.saveSuccess'))
    queryCache.invalidateQueries({ key: ['transcription-providers'] })
    queryCache.invalidateQueries({ key: ['transcription-provider-detail', id] })
  } catch {
    toast.error(t('common.saveFailed'))
  } finally {
    saveLoading.value = false
  }
}
// Save one model's config, keeping its existing display name.
async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
  const target = providerModels.value.find(item => item.id === modelId)
  if (!target) return
  // The PUT body requires a name; reuse whatever identifier is available.
  const name = target.name ?? target.model_id ?? modelId
  try {
    await putTranscriptionModelsById({
      path: { id: modelId },
      body: { name, config },
      throwOnError: true,
    })
    toast.success(t('transcription.saveSuccess'))
    queryCache.invalidateQueries({ key: ['transcription-provider-models', curProviderId.value ?? ''] })
    queryCache.invalidateQueries({ key: ['transcription-models'] })
  } catch {
    toast.error(t('common.saveFailed'))
  }
}
// Bulk-import the provider's catalog models, then report created/skipped counts
// and refresh every query that lists models or metadata.
async function handleImportModels() {
  const id = curProviderId.value
  if (!id) return
  importLoading.value = true
  try {
    const { data } = await postTranscriptionProvidersByIdImportModels({
      path: { id },
      throwOnError: true,
    })
    const { created = 0, skipped = 0 } = (data ?? {}) as { created?: number, skipped?: number }
    toast.success(t('transcription.importSuccess', { created, skipped }))
    queryCache.invalidateQueries({ key: ['transcription-provider-models', id] })
    queryCache.invalidateQueries({ key: ['transcription-models'] })
    queryCache.invalidateQueries({ key: ['transcription-providers-meta'] })
  } catch {
    toast.error(t('transcription.importFailed'))
  } finally {
    importLoading.value = false
  }
}
// Run a one-off transcription test for a model with the given audio file and
// config overrides. The config travels as a JSON string alongside the file.
// Errors propagate to the caller (ModelConfigEditor handles them).
async function handleTestModel(modelId: string, file: File, config: Record<string, unknown>) {
  const response = await postTranscriptionModelsByIdTest({
    path: { id: modelId },
    body: { file, config: JSON.stringify(config) },
    throwOnError: true,
  })
  return (response.data ?? {}) as AudioTestTranscriptionResponse
}
// Return a copy of the config with empty-string, null, and undefined
// values removed; everything else (including false and 0) is kept.
function sanitizeConfig(input: Record<string, unknown>) {
  const kept = Object.entries(input).filter(([, value]) => value !== '' && value != null)
  return Object.fromEntries(kept) as Record<string, unknown>
}
</script>
-8
View File
@@ -89,14 +89,6 @@ const routes = [
breadcrumb: i18nRef('sidebar.speech'),
},
},
{
name: 'transcription',
path: 'transcription',
component: () => import('@/pages/transcription/index.vue'),
meta: {
breadcrumb: i18nRef('sidebar.transcription'),
},
},
{
name: 'email',
path: 'email',
+17 -49
View File
@@ -23,7 +23,6 @@ import (
agentpkg "github.com/memohai/memoh/internal/agent"
"github.com/memohai/memoh/internal/agent/background"
agenttools "github.com/memohai/memoh/internal/agent/tools"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/bind"
"github.com/memohai/memoh/internal/boot"
"github.com/memohai/memoh/internal/bots"
@@ -88,6 +87,7 @@ import (
"github.com/memohai/memoh/internal/storage/providers/containerfs"
"github.com/memohai/memoh/internal/storage/providers/fallback"
"github.com/memohai/memoh/internal/storage/providers/localfs"
ttspkg "github.com/memohai/memoh/internal/tts"
"github.com/memohai/memoh/internal/version"
"github.com/memohai/memoh/internal/workspace"
)
@@ -331,7 +331,7 @@ func provideChannelRouter(
policyService *policy.Service,
bindService *bind.Service,
mediaService *media.Service,
audioService *audiopkg.Service,
ttsService *ttspkg.Service,
settingsService *settings.Service,
scheduleService *schedule.Service,
mcpConnService *mcp.ConnectionService,
@@ -372,8 +372,7 @@ func provideChannelRouter(
processor.SetMediaService(mediaService)
processor.SetStreamObserver(local.NewRouteHubBroadcaster(hub))
processor.SetDispatcher(inbound.NewRouteDispatcher(log))
processor.SetSpeechService(audioService, &settingsSpeechModelResolver{settings: settingsService})
processor.SetTranscriptionService(&settingsTranscriptionAdapter{audio: audioService}, &settingsTranscriptionModelResolver{settings: settingsService})
processor.SetTtsService(ttsService, &settingsTtsModelResolver{settings: settingsService})
cmdHandler := command.NewHandler(
log,
&command.BotMemberRoleAdapter{BotService: botService},
@@ -450,7 +449,7 @@ func provideBackgroundManager(log *slog.Logger) *background.Manager {
return background.New(log)
}
func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, mediaService *media.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, mcpConnService *mcp.ConnectionService, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, audioService *audiopkg.Service, sessionService *sessionpkg.Service, bgManager *background.Manager) []agenttools.ToolProvider {
func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *channel.Manager, registry *channel.Registry, routeService *route.DBService, scheduleService *schedule.Service, settingsService *settings.Service, searchProviderService *searchproviders.Service, manager *workspace.Manager, mediaService *media.Service, memoryRegistry *memprovider.Registry, emailService *emailpkg.Service, emailManager *emailpkg.Manager, fedGateway *handlers.MCPFederationGateway, mcpConnService *mcp.ConnectionService, modelsService *models.Service, browserContextService *browsercontexts.Service, queries *dbsqlc.Queries, ttsService *ttspkg.Service, sessionService *sessionpkg.Service, bgManager *background.Manager) []agenttools.ToolProvider {
var assetResolver messaging.AssetResolver
if mediaService != nil {
assetResolver = &mediaAssetResolverAdapter{media: mediaService}
@@ -468,8 +467,7 @@ func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *c
agenttools.NewSpawnProvider(log, settingsService, modelsService, queries, sessionService),
agenttools.NewSkillProvider(log),
agenttools.NewBrowserProvider(log, settingsService, browserContextService, manager, cfg.BrowserGateway),
agenttools.NewTTSProvider(log, settingsService, audioService, channelManager, registry),
agenttools.NewTranscriptionProvider(log, settingsService, audioService, mediaService),
agenttools.NewTTSProvider(log, settingsService, ttsService, channelManager, registry),
agenttools.NewImageGenProvider(log, settingsService, modelsService, queries, manager, config.DefaultDataMount),
agenttools.NewFederationProvider(log, fedSource),
agenttools.NewHistoryProvider(log, sessionService, queries),
@@ -513,23 +511,23 @@ func provideUsersHandler(log *slog.Logger, accountService *accounts.Service, ide
return handlers.NewUsersHandler(log, accountService, identityService, botService, routeService, channelStore, channelLifecycle, channelManager, registry)
}
func provideWebHandler(channelManager *channel.Manager, channelStore *channel.Store, chatService *conversation.Service, hub *local.RouteHub, botService *bots.Service, accountService *accounts.Service, resolver *flow.Resolver, mediaService *media.Service, audioService *audiopkg.Service, settingsService *settings.Service) *handlers.LocalChannelHandler {
func provideWebHandler(channelManager *channel.Manager, channelStore *channel.Store, chatService *conversation.Service, hub *local.RouteHub, botService *bots.Service, accountService *accounts.Service, resolver *flow.Resolver, mediaService *media.Service, ttsService *ttspkg.Service, settingsService *settings.Service) *handlers.LocalChannelHandler {
h := handlers.NewLocalChannelHandler(local.WebType, channelManager, channelStore, chatService, hub, botService, accountService)
h.SetResolver(resolver)
h.SetMediaService(mediaService)
h.SetSpeechService(audioService, &settingsSpeechModelResolver{settings: settingsService})
h.SetTtsService(ttsService, &settingsTtsModelResolver{settings: settingsService})
return h
}
func provideAudioRegistry() *audiopkg.Registry {
return audiopkg.NewRegistry()
func provideTtsRegistry() *ttspkg.Registry {
return ttspkg.NewRegistry()
}
func provideAudioTempStore() (*audiopkg.TempStore, error) {
return audiopkg.NewTempStore(os.TempDir())
func provideTtsTempStore() (*ttspkg.TempStore, error) {
return ttspkg.NewTempStore(os.TempDir())
}
func startAudioTempStoreCleanup(lc fx.Lifecycle, store *audiopkg.TempStore) {
func startTtsTempStoreCleanup(lc fx.Lifecycle, store *ttspkg.TempStore) {
done := make(chan struct{})
lc.Append(fx.Hook{
OnStart: func(_ context.Context) error {
@@ -585,11 +583,11 @@ func (a *sessionEnsurerAdapter) CreateNewSession(ctx context.Context, botID, rou
return inbound.SessionResult{ID: sess.ID, Type: sess.Type}, nil
}
type settingsSpeechModelResolver struct {
type settingsTtsModelResolver struct {
settings *settings.Service
}
func (r *settingsSpeechModelResolver) ResolveSpeechModelID(ctx context.Context, botID string) (string, error) {
func (r *settingsTtsModelResolver) ResolveTtsModelID(ctx context.Context, botID string) (string, error) {
s, err := r.settings.GetBot(ctx, botID)
if err != nil {
return "", err
@@ -597,36 +595,6 @@ func (r *settingsSpeechModelResolver) ResolveSpeechModelID(ctx context.Context,
return s.TtsModelID, nil
}
type settingsTranscriptionModelResolver struct {
settings *settings.Service
}
func (r *settingsTranscriptionModelResolver) ResolveTranscriptionModelID(ctx context.Context, botID string) (string, error) {
s, err := r.settings.GetBot(ctx, botID)
if err != nil {
return "", err
}
return s.TranscriptionModelID, nil
}
type settingsTranscriptionAdapter struct {
audio *audiopkg.Service
}
type inboundTranscriptionResult struct {
text string
}
func (r inboundTranscriptionResult) GetText() string { return r.text }
func (a *settingsTranscriptionAdapter) Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (inbound.TranscriptionResult, error) {
result, err := a.audio.Transcribe(ctx, modelID, audio, filename, contentType, overrideCfg)
if err != nil {
return nil, err
}
return inboundTranscriptionResult{text: result.Text}, nil
}
func provideEmailRegistry(log *slog.Logger, tokenStore *emailpkg.DBOAuthTokenStore) *emailpkg.Registry {
reg := emailpkg.NewRegistry()
reg.Register(emailgeneric.New(log))
@@ -716,11 +684,11 @@ func startRegistrySync(lc fx.Lifecycle, log *slog.Logger, cfg config.Config, que
})
}
func startAudioProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, queries *dbsqlc.Queries, registry *audiopkg.Registry) {
func startSpeechProviderBootstrap(lc fx.Lifecycle, log *slog.Logger, queries *dbsqlc.Queries, registry *ttspkg.Registry) {
lc.Append(fx.Hook{
OnStart: func(ctx context.Context) error {
if err := audiopkg.SyncRegistry(ctx, log, queries, registry); err != nil {
log.Warn("audio registry bootstrap failed", slog.Any("error", err))
if err := ttspkg.SyncRegistry(ctx, log, queries, registry); err != nil {
log.Warn("speech registry bootstrap failed", slog.Any("error", err))
}
return nil
},
+8 -8
View File
@@ -8,7 +8,6 @@ import (
"github.com/memohai/memoh/internal/accounts"
"github.com/memohai/memoh/internal/acl"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/bind"
"github.com/memohai/memoh/internal/boot"
"github.com/memohai/memoh/internal/bots"
@@ -30,6 +29,7 @@ import (
"github.com/memohai/memoh/internal/schedule"
"github.com/memohai/memoh/internal/searchproviders"
"github.com/memohai/memoh/internal/settings"
ttspkg "github.com/memohai/memoh/internal/tts"
)
func runServe() {
@@ -63,9 +63,9 @@ func options() fx.Option {
identities.NewService,
bind.NewService,
event.NewHub,
provideAudioRegistry,
audiopkg.NewService,
provideAudioTempStore,
provideTtsRegistry,
ttspkg.NewService,
provideTtsTempStore,
emailpkg.NewDBOAuthTokenStore,
provideEmailRegistry,
emailpkg.NewService,
@@ -121,8 +121,8 @@ func options() fx.Option {
provideServerHandler(weixin.NewQRServerHandler),
provideServerHandler(provideUsersHandler),
provideServerHandler(handlers.NewMemoryProvidersHandler),
provideServerHandler(handlers.NewAudioHandler),
provideServerHandler(handlers.NewBotAudioHandler),
provideServerHandler(handlers.NewSpeechHandler),
provideServerHandler(handlers.NewBotTtsHandler),
provideServerHandler(handlers.NewEmailProvidersHandler),
provideServerHandler(handlers.NewEmailBindingsHandler),
provideServerHandler(handlers.NewEmailOutboxHandler),
@@ -141,7 +141,7 @@ func options() fx.Option {
fx.Invoke(
injectToolProviders,
startRegistrySync,
startAudioProviderBootstrap,
startSpeechProviderBootstrap,
startMemoryProviderBootstrap,
startSearchProviderBootstrap,
startScheduleService,
@@ -151,7 +151,7 @@ func options() fx.Option {
startEmailManager,
startContainerReconciliation,
startBackgroundTaskCleanup,
startAudioTempStoreCleanup,
startTtsTempStoreCleanup,
startServer,
),
fx.WithLogger(func(logger *slog.Logger) fxevent.Logger {
@@ -1,9 +0,0 @@
name: Deepgram Transcription
client_type: deepgram-transcription
icon: deepgram
base_url: https://api.deepgram.com
models:
- model_id: nova-3
name: Nova-3
type: transcription
@@ -1,9 +0,0 @@
name: ElevenLabs Transcription
client_type: elevenlabs-transcription
icon: elevenlabs
base_url: https://api.elevenlabs.io
models:
- model_id: scribe_v2
name: Scribe v2
type: transcription
-9
View File
@@ -1,9 +0,0 @@
name: Google Transcription
client_type: google-transcription
icon: google-color
base_url: https://generativelanguage.googleapis.com/v1beta
models:
- model_id: gemini-2.5-flash
name: Gemini 2.5 Flash
type: transcription
-9
View File
@@ -1,9 +0,0 @@
name: OpenAI Transcription
client_type: openai-transcription
icon: openai
base_url: https://api.openai.com/v1
models:
- model_id: gpt-4o-mini-transcribe
name: GPT-4o Mini Transcribe
type: transcription
@@ -1,9 +0,0 @@
name: OpenRouter Transcription
client_type: openrouter-transcription
icon: openrouter
base_url: https://openrouter.ai/api/v1
models:
- model_id: openai/gpt-4o-mini-transcribe
name: OpenRouter Transcription
type: transcription
+2 -9
View File
@@ -77,19 +77,13 @@ CREATE TABLE IF NOT EXISTS providers (
'github-copilot',
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
))
);
@@ -114,7 +108,7 @@ CREATE TABLE IF NOT EXISTS models (
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
CONSTRAINT models_provider_id_model_id_unique UNIQUE (provider_id, model_id),
CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech', 'transcription'))
CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech'))
);
CREATE TABLE IF NOT EXISTS model_variants (
@@ -176,7 +170,6 @@ CREATE TABLE IF NOT EXISTS bots (
image_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
discuss_probe_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
tts_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
transcription_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
browser_context_id UUID REFERENCES browser_contexts(id) ON DELETE SET NULL,
persist_full_tool_results BOOLEAN NOT NULL DEFAULT false,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
@@ -1,33 +0,0 @@
-- 0069_add_transcription_models_and_speech_domain
-- Revert transcription model type and speech-domain expansion.
DELETE FROM models WHERE type = 'transcription';
DELETE FROM providers WHERE client_type = 'google-speech';
ALTER TABLE models
DROP CONSTRAINT IF EXISTS models_type_check;
ALTER TABLE models
ADD CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech'));
ALTER TABLE providers
DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE providers
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech'
));
@@ -1,31 +0,0 @@
-- 0069_add_transcription_models_and_speech_domain
-- Expand the speech domain to support transcription models and shared speech providers.
ALTER TABLE providers
DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE providers
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech'
));
ALTER TABLE models
DROP CONSTRAINT IF EXISTS models_type_check;
ALTER TABLE models
ADD CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech', 'transcription'));
@@ -1,8 +0,0 @@
-- 0070_add_bot_transcription_model
-- Remove bots.transcription_model_id.
ALTER TABLE bots
DROP CONSTRAINT IF EXISTS bots_transcription_model_id_fkey;
ALTER TABLE bots
DROP COLUMN IF EXISTS transcription_model_id;
@@ -1,5 +0,0 @@
-- 0070_add_bot_transcription_model
-- Add bots.transcription_model_id for bot-level speech-to-text defaults.
ALTER TABLE bots
ADD COLUMN IF NOT EXISTS transcription_model_id UUID REFERENCES models(id) ON DELETE SET NULL;
@@ -1,33 +0,0 @@
-- 0071_split_transcription_providers
-- Remove dedicated transcription provider client types.
DELETE FROM providers
WHERE client_type IN (
'openai-transcription',
'openrouter-transcription',
'elevenlabs-transcription',
'deepgram-transcription',
'google-transcription'
);
ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE providers
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openrouter-speech',
'elevenlabs-speech',
'deepgram-speech',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech'
));
@@ -1,29 +0,0 @@
-- 0071_split_transcription_providers
-- Add dedicated transcription provider client types.
ALTER TABLE providers DROP CONSTRAINT IF EXISTS providers_client_type_check;
ALTER TABLE providers
ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
'openai-responses',
'openai-completions',
'anthropic-messages',
'google-generative-ai',
'openai-codex',
'github-copilot',
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
));
+11 -61
View File
@@ -16,27 +16,18 @@ SELECT * FROM providers WHERE id = sqlc.arg(id);
-- name: GetProviderByName :one
SELECT * FROM providers WHERE name = sqlc.arg(name);
-- name: GetProviderByClientType :one
SELECT * FROM providers WHERE client_type = sqlc.arg(client_type);
-- name: ListProviders :many
SELECT * FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
)
ORDER BY created_at DESC;
@@ -62,19 +53,13 @@ FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
);
-- name: CreateModel :one
@@ -101,7 +86,7 @@ ORDER BY created_at DESC;
-- name: ListModels :many
SELECT * FROM models
WHERE type NOT IN ('speech', 'transcription')
WHERE type != 'speech'
ORDER BY created_at DESC;
-- name: ListModelsByType :many
@@ -112,7 +97,7 @@ ORDER BY created_at DESC;
-- name: ListModelsByProviderID :many
SELECT * FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND type NOT IN ('speech', 'transcription')
AND type != 'speech'
ORDER BY created_at DESC;
-- name: ListModelsByProviderIDAndType :many
@@ -151,15 +136,9 @@ DELETE FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND model_id = sqlc.arg(model_id);
-- name: DeleteModelByProviderAndType :exec
DELETE FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND model_id = sqlc.arg(model_id)
AND type = sqlc.arg(type);
-- name: CountModels :one
SELECT COUNT(*) FROM models
WHERE type NOT IN ('speech', 'transcription');
WHERE type != 'speech';
-- name: CountModelsByType :one
SELECT COUNT(*) FROM models WHERE type = sqlc.arg(type);
@@ -171,6 +150,11 @@ VALUES (sqlc.arg(name), sqlc.arg(client_type), sqlc.arg(icon), false, sqlc.arg(c
ON CONFLICT (name) DO UPDATE SET
icon = EXCLUDED.icon,
client_type = EXCLUDED.client_type,
config = CASE
WHEN providers.config->>'api_key' IS NOT NULL AND providers.config->>'api_key' != ''
THEN jsonb_set(EXCLUDED.config, '{api_key}', providers.config->'api_key')
ELSE EXCLUDED.config
END,
updated_at = now()
RETURNING *;
@@ -189,7 +173,7 @@ SELECT m.*
FROM models m
JOIN providers p ON m.provider_id = p.id
WHERE p.enable = true
AND m.type NOT IN ('speech', 'transcription')
AND m.type != 'speech'
ORDER BY m.created_at DESC;
-- name: ListEnabledModelsByType :many
@@ -247,17 +231,6 @@ WHERE client_type IN (
)
ORDER BY created_at DESC;
-- name: ListTranscriptionProviders :many
SELECT * FROM providers
WHERE client_type IN (
'openai-transcription',
'openrouter-transcription',
'elevenlabs-transcription',
'deepgram-transcription',
'google-transcription'
)
ORDER BY created_at DESC;
-- name: ListSpeechModels :many
SELECT m.*,
p.client_type AS provider_type
@@ -277,26 +250,3 @@ SELECT * FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND model_id = sqlc.arg(model_id)
LIMIT 1;
-- name: GetTranscriptionModelWithProvider :one
SELECT
m.*,
p.client_type AS provider_type
FROM models m
JOIN providers p ON p.id = m.provider_id
WHERE m.id = sqlc.arg(id)
AND m.type = 'transcription';
-- name: ListTranscriptionModels :many
SELECT m.*,
p.client_type AS provider_type
FROM models m
JOIN providers p ON p.id = m.provider_id
WHERE m.type = 'transcription'
ORDER BY m.created_at DESC;
-- name: ListTranscriptionModelsByProviderID :many
SELECT * FROM models
WHERE provider_id = sqlc.arg(provider_id)
AND type = 'transcription'
ORDER BY created_at DESC;
+1 -7
View File
@@ -19,7 +19,6 @@ SELECT
memory_providers.id AS memory_provider_id,
image_models.id AS image_model_id,
tts_models.id AS tts_model_id,
transcription_models.id AS transcription_model_id,
browser_contexts.id AS browser_context_id,
bots.persist_full_tool_results
FROM bots
@@ -31,7 +30,6 @@ LEFT JOIN models AS image_models ON image_models.id = bots.image_model_id
LEFT JOIN search_providers ON search_providers.id = bots.search_provider_id
LEFT JOIN memory_providers ON memory_providers.id = bots.memory_provider_id
LEFT JOIN models AS tts_models ON tts_models.id = bots.tts_model_id
LEFT JOIN models AS transcription_models ON transcription_models.id = bots.transcription_model_id
LEFT JOIN browser_contexts ON browser_contexts.id = bots.browser_context_id
WHERE bots.id = $1;
@@ -56,12 +54,11 @@ WITH updated AS (
memory_provider_id = COALESCE(sqlc.narg(memory_provider_id)::uuid, bots.memory_provider_id),
image_model_id = COALESCE(sqlc.narg(image_model_id)::uuid, bots.image_model_id),
tts_model_id = COALESCE(sqlc.narg(tts_model_id)::uuid, bots.tts_model_id),
transcription_model_id = COALESCE(sqlc.narg(transcription_model_id)::uuid, bots.transcription_model_id),
browser_context_id = COALESCE(sqlc.narg(browser_context_id)::uuid, bots.browser_context_id),
persist_full_tool_results = sqlc.arg(persist_full_tool_results),
updated_at = now()
WHERE bots.id = sqlc.arg(id)
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.transcription_model_id, bots.browser_context_id, bots.persist_full_tool_results
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.browser_context_id, bots.persist_full_tool_results
)
SELECT
updated.id AS bot_id,
@@ -83,7 +80,6 @@ SELECT
memory_providers.id AS memory_provider_id,
image_models.id AS image_model_id,
tts_models.id AS tts_model_id,
transcription_models.id AS transcription_model_id,
browser_contexts.id AS browser_context_id,
updated.persist_full_tool_results
FROM updated
@@ -95,7 +91,6 @@ LEFT JOIN models AS image_models ON image_models.id = updated.image_model_id
LEFT JOIN search_providers ON search_providers.id = updated.search_provider_id
LEFT JOIN memory_providers ON memory_providers.id = updated.memory_provider_id
LEFT JOIN models AS tts_models ON tts_models.id = updated.tts_model_id
LEFT JOIN models AS transcription_models ON transcription_models.id = updated.transcription_model_id
LEFT JOIN browser_contexts ON browser_contexts.id = updated.browser_context_id;
-- name: DeleteSettingsByBotID :exec
@@ -117,7 +112,6 @@ SET language = 'auto',
search_provider_id = NULL,
memory_provider_id = NULL,
tts_model_id = NULL,
transcription_model_id = NULL,
browser_context_id = NULL,
persist_full_tool_results = false,
updated_at = now()
+4 -2
View File
@@ -72,7 +72,8 @@ func TestSpawnAndNotify(t *testing.T) {
task := mgr.Get(taskID)
if task == nil {
t.Fatal("task not found after completion")
} else if task.Status != TaskCompleted {
}
if task.Status != TaskCompleted {
t.Errorf("expected task status completed, got %s", task.Status)
}
}
@@ -129,7 +130,8 @@ func TestKillTask(t *testing.T) {
task := mgr.Get(taskID)
if task == nil {
t.Fatal("task not found")
} else if task.Status != TaskKilled {
}
if task.Status != TaskKilled {
t.Errorf("expected status killed, got %s", task.Status)
}
+1 -1
View File
@@ -84,7 +84,7 @@ func retryDelay(attempt int, cfg RetryConfig) time.Duration {
if backoffIdx > 20 {
backoffIdx = 20
}
delay := cfg.BaseDelay * time.Duration(1<<backoffIdx)
delay := cfg.BaseDelay * time.Duration(1<<uint(backoffIdx))
delay = min(delay, cfg.MaxDelay)
// Add jitter: random value in [0, delay/2), so final delay is in [delay/2, delay).
// math/rand is intentional here — cryptographic randomness is not needed for backoff jitter.
+3 -3
View File
@@ -295,7 +295,7 @@ func (p *ContainerProvider) execRead(ctx context.Context, session SessionContext
content += "\n"
}
content = addLineNumbers(content, lineOffset)
content = addLineNumbers(content, int32(lineOffset))
return map[string]any{"content": content, "total_lines": totalLines}, nil
}
@@ -757,7 +757,7 @@ func truncateStr(s string, n int) string {
return s[:n] + "..."
}
func addLineNumbers(content string, startLine int) string {
func addLineNumbers(content string, startLine int32) string {
if content == "" {
return content
}
@@ -765,7 +765,7 @@ func addLineNumbers(content string, startLine int) string {
var out strings.Builder
out.Grow(len(content) + len(lines)*8)
for i, line := range lines {
fmt.Fprintf(&out, "%6d\t%s\n", startLine+i, line)
fmt.Fprintf(&out, "%6d\t%s\n", int(startLine)+i, line)
}
return out.String()
}
-232
View File
@@ -1,232 +0,0 @@
//nolint:gosec
package tools
import (
"context"
"errors"
"fmt"
"io"
"log/slog"
"net"
"net/http"
"net/url"
"path/filepath"
"strings"
"time"
sdk "github.com/memohai/twilight-ai/sdk"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/media"
"github.com/memohai/memoh/internal/settings"
)
// mediaDataPrefix is the virtual path prefix under which bot media
// assets are exposed to tools; paths below it map to storage keys.
const mediaDataPrefix = "/data/media/"

// TranscriptionProvider wires the transcribe_audio tool: it resolves
// the bot's configured transcription model, loads audio from media
// storage or an external URL, and runs the transcription service.
type TranscriptionProvider struct {
	logger   *slog.Logger
	settings *settings.Service
	audio    *audiopkg.Service
	media    *media.Service
	// http downloads remote audio; redirects are re-validated (SSRF guard).
	http *http.Client
}
// NewTranscriptionProvider builds the tool provider. A nil logger falls
// back to slog.Default(). The embedded HTTP client enforces a 30s
// timeout and re-checks every redirect target against validateURL so a
// public URL cannot redirect the downloader into a private network.
func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
	if log == nil {
		log = slog.Default()
	}
	return &TranscriptionProvider{
		logger:   log.With(slog.String("tool", "transcribe_audio")),
		settings: settingsSvc,
		audio:    audioSvc,
		media:    mediaSvc,
		http: &http.Client{
			Timeout: 30 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				// Cap redirect chains to avoid loops.
				if len(via) >= 10 {
					return errors.New("stopped after 10 redirects")
				}
				// Validate each hop, not just the initial URL.
				if _, err := validateURL(req.Context(), req.URL.String()); err != nil {
					return fmt.Errorf("redirect to non-public address is not allowed: %w", err)
				}
				return nil
			},
		},
	}
}
// Tools exposes the transcribe_audio tool, or nothing at all when it
// cannot work for this session: subagents, missing services, an empty
// bot id, or a bot without a configured transcription model all yield
// (nil, nil) rather than an error, silently hiding the tool.
func (p *TranscriptionProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
	if session.IsSubagent || p.settings == nil || p.audio == nil || p.media == nil {
		return nil, nil
	}
	botID := strings.TrimSpace(session.BotID)
	if botID == "" {
		return nil, nil
	}
	// NOTE(review): a settings lookup error is deliberately swallowed
	// here (tool just not offered); execTranscribe reports it instead.
	botSettings, err := p.settings.GetBot(ctx, botID)
	if err != nil || strings.TrimSpace(botSettings.TranscriptionModelID) == "" {
		return nil, nil
	}
	// Capture the session by value for the Execute closure.
	sess := session
	return []sdk.Tool{{
		Name:        "transcribe_audio",
		Description: "Transcribe an audio or voice message into text. Use this when the user sent a voice message and you need to understand its contents. Accepts a bot media path such as /data/media/... or a direct URL.",
		Parameters: map[string]any{
			"type": "object",
			"properties": map[string]any{
				"path":        map[string]any{"type": "string", "description": "Audio file path from the message context, usually under /data/media/..."},
				"url":         map[string]any{"type": "string", "description": "Direct audio URL when a path is unavailable"},
				"language":    map[string]any{"type": "string", "description": "Optional language hint"},
				"prompt":      map[string]any{"type": "string", "description": "Optional transcription prompt"},
				"contentType": map[string]any{"type": "string", "description": "Optional MIME type override"},
			},
			"required": []string{},
		},
		Execute: func(execCtx *sdk.ToolExecContext, input any) (any, error) {
			return p.execTranscribe(execCtx.Context, sess, inputAsMap(input))
		},
	}}, nil
}
// execTranscribe handles a transcribe_audio invocation: it re-validates
// the bot's transcription model (settings may have changed since Tools
// ran), loads the audio bytes from a media path or URL, and forwards
// optional language/prompt overrides to the audio service.
// Returns a map with ok, text, language and duration_seconds.
func (p *TranscriptionProvider) execTranscribe(ctx context.Context, session SessionContext, args map[string]any) (any, error) {
	botID := strings.TrimSpace(session.BotID)
	if botID == "" {
		return nil, errors.New("bot_id is required")
	}
	botSettings, err := p.settings.GetBot(ctx, botID)
	if err != nil {
		return nil, errors.New("failed to load bot settings")
	}
	modelID := strings.TrimSpace(botSettings.TranscriptionModelID)
	if modelID == "" {
		return nil, errors.New("bot has no transcription model configured")
	}
	// Accept several aliases for the same argument to be lenient with
	// model-generated tool calls.
	path := FirstStringArg(args, "path", "audio_path", "file_path")
	rawURL := FirstStringArg(args, "url", "audio_url")
	if path == "" && rawURL == "" {
		return nil, errors.New("path or url is required")
	}
	audio, filename, contentType, err := p.loadAudio(ctx, botID, path, rawURL, FirstStringArg(args, "contentType", "content_type"))
	if err != nil {
		return nil, err
	}
	// Only set override keys that were actually provided.
	override := map[string]any{}
	if language := FirstStringArg(args, "language"); language != "" {
		override["language"] = language
	}
	if prompt := FirstStringArg(args, "prompt"); prompt != "" {
		override["prompt"] = prompt
	}
	result, err := p.audio.Transcribe(ctx, modelID, audio, filename, contentType, override)
	if err != nil {
		return nil, err
	}
	return map[string]any{
		"ok":               true,
		"text":             result.Text,
		"language":         result.Language,
		"duration_seconds": result.DurationSeconds,
	}, nil
}
// loadAudio fetches audio bytes either from bot media storage (when
// pathValue is set — the path branch wins over the URL) or by
// downloading rawURL after an SSRF validation pass. It returns the
// bytes, a best-effort filename, and a content type (override first,
// then the HTTP Content-Type header).
//
// NOTE(review): the download uses io.ReadAll with no size cap, so a
// very large remote file is buffered entirely in memory — consider an
// io.LimitReader if that becomes a problem.
func (p *TranscriptionProvider) loadAudio(ctx context.Context, botID, pathValue, rawURL, contentTypeOverride string) ([]byte, string, string, error) {
	if pathValue != "" {
		return p.loadAudioFromPath(ctx, botID, pathValue, contentTypeOverride)
	}
	u, err := validateURL(ctx, rawURL)
	if err != nil {
		return nil, "", "", err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, "", "", err
	}
	resp, err := p.http.Do(req)
	if err != nil {
		return nil, "", "", err
	}
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Close explicitly on the error path; the defer below is only
		// installed for the success path.
		_ = resp.Body.Close()
		return nil, "", "", fmt.Errorf("download audio: unexpected status %d", resp.StatusCode)
	}
	defer func(body io.ReadCloser) {
		if closeErr := body.Close(); closeErr != nil {
			p.logger.Warn("failed to close audio response body", slog.Any("error", closeErr))
		}
	}(resp.Body)
	audio, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, "", "", err
	}
	contentType := strings.TrimSpace(contentTypeOverride)
	if contentType == "" {
		contentType = strings.TrimSpace(resp.Header.Get("Content-Type"))
	}
	return audio, filepath.Base(strings.TrimSpace(req.URL.Path)), contentType, nil
}
// loadAudioFromPath resolves a /data/media/... path to a media asset
// owned by botID and reads its full contents. The storage key is the
// path with the mediaDataPrefix stripped; a path that does not carry
// the prefix (TrimPrefix left it unchanged) is rejected.
// Content type resolution: explicit override first, then the asset's
// recorded MIME type.
func (p *TranscriptionProvider) loadAudioFromPath(ctx context.Context, botID, pathValue, contentTypeOverride string) ([]byte, string, string, error) {
	storageKey := strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(pathValue), mediaDataPrefix))
	if storageKey == "" || storageKey == strings.TrimSpace(pathValue) {
		return nil, "", "", fmt.Errorf("unsupported media path: %s", pathValue)
	}
	// Scoped by botID so a bot can only read its own media.
	asset, err := p.media.GetByStorageKey(ctx, botID, storageKey)
	if err != nil {
		return nil, "", "", err
	}
	reader, _, err := p.media.Open(ctx, botID, asset.ContentHash)
	if err != nil {
		return nil, "", "", err
	}
	defer func(reader io.ReadCloser) {
		if closeErr := reader.Close(); closeErr != nil {
			p.logger.Warn("failed to close media reader", slog.Any("error", closeErr))
		}
	}(reader)
	audio, err := io.ReadAll(reader)
	if err != nil {
		return nil, "", "", err
	}
	contentType := strings.TrimSpace(contentTypeOverride)
	if contentType == "" {
		contentType = strings.TrimSpace(asset.Mime)
	}
	return audio, filepath.Base(storageKey), contentType, nil
}
func validateURL(ctx context.Context, rawURL string) (*url.URL, error) {
u, err := url.Parse(rawURL)
if err != nil {
return nil, fmt.Errorf("invalid url: %w", err)
}
if u.Scheme != "http" && u.Scheme != "https" {
return nil, fmt.Errorf("unsupported scheme: %s", u.Scheme)
}
hostname := u.Hostname()
if hostname == "" {
return nil, errors.New("missing hostname in url")
}
resolver := net.Resolver{}
ips, err := resolver.LookupIPAddr(ctx, hostname)
if err != nil {
return nil, fmt.Errorf("dns lookup failed for %s: %w", hostname, err)
}
if len(ips) == 0 {
return nil, fmt.Errorf("no ip addresses found for %s", hostname)
}
for _, ip := range ips {
if ip.IP.IsLoopback() || ip.IP.IsPrivate() || ip.IP.IsLinkLocalUnicast() || ip.IP.IsLinkLocalMulticast() {
return nil, fmt.Errorf("url resolves to a non-public ip address: %s", ip.IP.String())
}
}
return u, nil
}
+6 -6
View File
@@ -10,9 +10,9 @@ import (
sdk "github.com/memohai/twilight-ai/sdk"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/channel"
"github.com/memohai/memoh/internal/settings"
ttspkg "github.com/memohai/memoh/internal/tts"
)
const ttsMaxTextLen = 500
@@ -30,26 +30,26 @@ type TTSChannelResolver interface {
type TTSProvider struct {
logger *slog.Logger
settings *settings.Service
audio *audiopkg.Service
tts *ttspkg.Service
sender TTSSender
resolver TTSChannelResolver
}
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
if log == nil {
log = slog.Default()
}
return &TTSProvider{
logger: log.With(slog.String("tool", "tts")),
settings: settingsSvc,
audio: audioSvc,
tts: ttsSvc,
sender: sender,
resolver: resolver,
}
}
func (p *TTSProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
if session.IsSubagent || p.settings == nil || p.audio == nil || p.sender == nil || p.resolver == nil {
if session.IsSubagent || p.settings == nil || p.tts == nil || p.sender == nil || p.resolver == nil {
return nil, nil
}
botID := strings.TrimSpace(session.BotID)
@@ -115,7 +115,7 @@ func (p *TTSProvider) execSpeak(ctx context.Context, session SessionContext, arg
if botSettings.TtsModelID == "" {
return nil, errors.New("bot has no TTS model configured")
}
audioData, contentType, synthErr := p.audio.Synthesize(ctx, botSettings.TtsModelID, text, nil)
audioData, contentType, synthErr := p.tts.Synthesize(ctx, botSettings.TtsModelID, text, nil)
if synthErr != nil {
return nil, fmt.Errorf("speech synthesis failed: %s", synthErr.Error())
}
-100
View File
@@ -1,100 +0,0 @@
package audio
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgtype"
"github.com/memohai/memoh/internal/db/sqlc"
"github.com/memohai/memoh/internal/models"
)
// SyncRegistry reconciles the in-code audio provider registry with the
// models table: for every registered provider it upserts its speech and
// transcription model templates, and deletes rows for models the
// template marks hidden. Providers with no DB row are skipped with a
// warning; any other DB error aborts the sync.
// The logger parameter may be nil (logging is then suppressed).
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
	for _, def := range registry.List() {
		provider, err := queries.GetProviderByClientType(ctx, string(def.ClientType))
		if err != nil {
			if errors.Is(err, pgx.ErrNoRows) {
				// No provider row yet — nothing to attach models to.
				if logger != nil {
					logger.Warn("audio registry skipped provider without template",
						slog.String("provider", string(def.ClientType)),
						slog.String("display_name", def.DisplayName))
				}
				continue
			}
			if logger != nil {
				logger.Warn("audio registry failed to load provider template",
					slog.String("provider", string(def.ClientType)),
					slog.String("display_name", def.DisplayName),
					slog.Any("error", err))
			}
			return fmt.Errorf("get provider by client type %s: %w", def.ClientType, err)
		}
		// NOTE(review): synced counts only speech models, so the
		// "models" field in the final log excludes transcription models.
		synced := 0
		// Transcription-only providers have no speech templates to sync.
		if !isTranscriptionClientType(def.ClientType) {
			for _, model := range def.Models {
				if shouldHideTemplateModel(def, models.ModelTypeSpeech, model.ID) {
					// Template-only entries are removed from the DB.
					if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
						ProviderID: provider.ID,
						ModelID:    model.ID,
						Type:       string(models.ModelTypeSpeech),
					}); err != nil {
						return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
					}
					continue
				}
				modelConfigJSON, err := json.Marshal(map[string]any{})
				if err != nil {
					return fmt.Errorf("marshal speech model config: %w", err)
				}
				name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
				if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
					ModelID:    model.ID,
					Name:       name,
					ProviderID: provider.ID,
					Type:       string(models.ModelTypeSpeech),
					Config:     modelConfigJSON,
				}); err != nil {
					return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
				}
				synced++
			}
		}
		// Transcription templates are synced for every provider type.
		for _, model := range def.TranscriptionModels {
			if shouldHideTemplateModel(def, models.ModelTypeTranscription, model.ID) {
				if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
					ProviderID: provider.ID,
					ModelID:    model.ID,
					Type:       string(models.ModelTypeTranscription),
				}); err != nil {
					return fmt.Errorf("delete hidden transcription template model %s: %w", model.ID, err)
				}
				continue
			}
			modelConfigJSON, err := json.Marshal(map[string]any{})
			if err != nil {
				return fmt.Errorf("marshal transcription model config: %w", err)
			}
			name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
			if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
				ModelID:    model.ID,
				Name:       name,
				ProviderID: provider.ID,
				Type:       string(models.ModelTypeTranscription),
				Config:     modelConfigJSON,
			}); err != nil {
				return fmt.Errorf("upsert transcription model %s: %w", model.ID, err)
			}
		}
		if logger != nil {
			logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
		}
	}
	return nil
}
-769
View File
@@ -1,769 +0,0 @@
package audio
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"github.com/jackc/pgx/v5/pgtype"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/db"
"github.com/memohai/memoh/internal/db/sqlc"
"github.com/memohai/memoh/internal/models"
)
// Service is the audio domain service: it serves speech (TTS) and
// transcription (STT) providers/models backed by the models table and
// the static provider registry, and performs synthesis/transcription
// through the SDK.
type Service struct {
	queries  *sqlc.Queries
	logger   *slog.Logger
	registry *Registry
}

// NewService constructs the audio Service; the logger is tagged with
// service=audio. log must be non-nil (it is dereferenced immediately).
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
	return &Service{
		queries:  queries,
		logger:   log.With(slog.String("service", "audio")),
		registry: registry,
	}
}
// Registry exposes the underlying provider registry.
func (s *Service) Registry() *Registry { return s.registry }

// ListMeta returns metadata for all registered audio providers.
// The context is unused; the registry is in-memory.
func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
	return s.registry.ListMeta()
}

// ListSpeechMeta returns metadata for speech (TTS) providers only.
func (s *Service) ListSpeechMeta(_ context.Context) []ProviderMetaResponse {
	return s.registry.ListSpeechMeta()
}

// ListTranscriptionMeta returns metadata for transcription providers only.
func (s *Service) ListTranscriptionMeta(_ context.Context) []ProviderMetaResponse {
	return s.registry.ListTranscriptionMeta()
}
// ListSpeechProviders returns all configured speech (TTS) provider rows
// mapped to the API response shape (secrets masked).
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
	rows, err := s.queries.ListSpeechProviders(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech providers: %w", err)
	}
	items := make([]SpeechProviderResponse, 0, len(rows))
	for _, row := range rows {
		items = append(items, toSpeechProviderResponse(row))
	}
	return items, nil
}

// ListTranscriptionProviders returns all configured transcription
// provider rows; it reuses the speech provider response shape.
func (s *Service) ListTranscriptionProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
	rows, err := s.queries.ListTranscriptionProviders(ctx)
	if err != nil {
		return nil, fmt.Errorf("list transcription providers: %w", err)
	}
	items := make([]SpeechProviderResponse, 0, len(rows))
	for _, row := range rows {
		items = append(items, toSpeechProviderResponse(row))
	}
	return items, nil
}
// GetSpeechProvider looks up a single provider row by its UUID string
// and maps it to the speech provider response shape.
func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
	providerID, parseErr := db.ParseUUID(id)
	if parseErr != nil {
		return SpeechProviderResponse{}, parseErr
	}
	row, queryErr := s.queries.GetProviderByID(ctx, providerID)
	if queryErr != nil {
		return SpeechProviderResponse{}, fmt.Errorf("get speech provider: %w", queryErr)
	}
	return toSpeechProviderResponse(row), nil
}
// ListSpeechModels returns all speech models across providers, dropping
// models the registry template marks as template-only (hidden).
func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
	rows, err := s.queries.ListSpeechModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech models: %w", err)
	}
	items := make([]SpeechModelResponse, 0, len(rows))
	for _, row := range rows {
		if s.shouldHideModel(row.ProviderType, models.ModelTypeSpeech, row.ModelID) {
			continue
		}
		items = append(items, toSpeechModelFromListRow(row))
	}
	return items, nil
}

// ListTranscriptionModels is the transcription counterpart of
// ListSpeechModels, with the same template-only filtering.
func (s *Service) ListTranscriptionModels(ctx context.Context) ([]TranscriptionModelResponse, error) {
	rows, err := s.queries.ListTranscriptionModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list transcription models: %w", err)
	}
	items := make([]TranscriptionModelResponse, 0, len(rows))
	for _, row := range rows {
		if s.shouldHideModel(row.ProviderType, models.ModelTypeTranscription, row.ModelID) {
			continue
		}
		items = append(items, toTranscriptionModelFromListRow(row))
	}
	return items, nil
}
// ListSpeechModelsByProvider returns the speech models of one provider,
// filtered against that provider's registry template so hidden
// (template-only) entries are omitted. Fails if the provider id is not
// a UUID, the row is missing, or the client type is not registered.
func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	// The registry definition drives the hidden-model filter below.
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	rows, err := s.queries.ListSpeechModelsByProviderID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("list speech models by provider: %w", err)
	}
	items := make([]SpeechModelResponse, 0, len(rows))
	for _, row := range rows {
		if shouldHideTemplateModel(def, models.ModelTypeSpeech, row.ModelID) {
			continue
		}
		// Provider type left empty: caller already knows the provider.
		items = append(items, toSpeechModelFromModel(row, ""))
	}
	return items, nil
}

// ListTranscriptionModelsByProvider mirrors ListSpeechModelsByProvider
// for transcription models.
func (s *Service) ListTranscriptionModelsByProvider(ctx context.Context, providerID string) ([]TranscriptionModelResponse, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	rows, err := s.queries.ListTranscriptionModelsByProviderID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("list transcription models by provider: %w", err)
	}
	items := make([]TranscriptionModelResponse, 0, len(rows))
	for _, row := range rows {
		if shouldHideTemplateModel(def, models.ModelTypeTranscription, row.ModelID) {
			continue
		}
		items = append(items, toTranscriptionModelFromModel(row, ""))
	}
	return items, nil
}
// GetSpeechModel fetches one speech model (joined with its provider's
// client type) by model UUID.
func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return SpeechModelResponse{}, err
	}
	row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
	}
	return toSpeechModelWithProviderResponse(row), nil
}

// GetTranscriptionModel fetches one transcription model (joined with
// its provider's client type) by model UUID.
func (s *Service) GetTranscriptionModel(ctx context.Context, id string) (TranscriptionModelResponse, error) {
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return TranscriptionModelResponse{}, err
	}
	row, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
	if err != nil {
		return TranscriptionModelResponse{}, fmt.Errorf("get transcription model: %w", err)
	}
	return toTranscriptionModelWithProviderResponse(row), nil
}
// UpdateSpeechModel replaces a speech model's config and, when
// req.Name is non-nil, its display name (an empty *req.Name clears the
// name by storing an invalid pgtype.Text). Model id, provider, and type
// are preserved from the existing row.
func (s *Service) UpdateSpeechModel(ctx context.Context, id string, req UpdateSpeechModelRequest) (SpeechModelResponse, error) {
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return SpeechModelResponse{}, err
	}
	row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
	}
	configJSON, err := json.Marshal(req.Config)
	if err != nil {
		return SpeechModelResponse{}, fmt.Errorf("marshal speech config: %w", err)
	}
	// Name is optional: nil means "keep current".
	name := row.Name
	if req.Name != nil {
		name = pgtype.Text{String: *req.Name, Valid: *req.Name != ""}
	}
	updated, err := s.queries.UpdateModel(ctx, sqlc.UpdateModelParams{
		ID:         pgID,
		ModelID:    row.ModelID,
		Name:       name,
		ProviderID: row.ProviderID,
		Type:       string(models.ModelTypeSpeech),
		Config:     configJSON,
	})
	if err != nil {
		return SpeechModelResponse{}, fmt.Errorf("update speech model: %w", err)
	}
	return toSpeechModelFromModel(updated, row.ProviderType), nil
}

// UpdateTranscriptionModel mirrors UpdateSpeechModel for transcription
// models; it shares UpdateSpeechModelRequest as the input shape.
func (s *Service) UpdateTranscriptionModel(ctx context.Context, id string, req UpdateSpeechModelRequest) (TranscriptionModelResponse, error) {
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return TranscriptionModelResponse{}, err
	}
	row, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
	if err != nil {
		return TranscriptionModelResponse{}, fmt.Errorf("get transcription model: %w", err)
	}
	configJSON, err := json.Marshal(req.Config)
	if err != nil {
		return TranscriptionModelResponse{}, fmt.Errorf("marshal transcription config: %w", err)
	}
	name := row.Name
	if req.Name != nil {
		name = pgtype.Text{String: *req.Name, Valid: *req.Name != ""}
	}
	updated, err := s.queries.UpdateModel(ctx, sqlc.UpdateModelParams{
		ID:         pgID,
		ModelID:    row.ModelID,
		Name:       name,
		ProviderID: row.ProviderID,
		Type:       string(models.ModelTypeTranscription),
		Config:     configJSON,
	})
	if err != nil {
		return TranscriptionModelResponse{}, fmt.Errorf("update transcription model: %w", err)
	}
	return toTranscriptionModelFromModel(updated, row.ProviderType), nil
}
// Synthesize turns text into audio with the given speech model.
// Returns the raw audio bytes and their content type. overrideCfg keys
// win over model and provider config (see resolveSpeechParams).
func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
	params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
	if err != nil {
		return nil, "", err
	}
	result, err := sdk.GenerateSpeech(ctx,
		sdk.WithSpeechModel(params.model),
		sdk.WithText(text),
		sdk.WithSpeechConfig(params.config),
	)
	if err != nil {
		return nil, "", fmt.Errorf("synthesize: %w", err)
	}
	return result.Audio, result.ContentType, nil
}
// StreamToFile synthesizes text and writes the audio to w, returning
// the content type.
//
// NOTE(review): despite the name, this buffers the entire stream via
// streamResult.Bytes() before a single Write — it is not incremental.
func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
	params, err := s.resolveSpeechParams(ctx, modelID, text, nil)
	if err != nil {
		return "", err
	}
	streamResult, err := sdk.StreamSpeech(ctx,
		sdk.WithSpeechModel(params.model),
		sdk.WithText(text),
		sdk.WithSpeechConfig(params.config),
	)
	if err != nil {
		return "", fmt.Errorf("stream: %w", err)
	}
	audio, err := streamResult.Bytes()
	if err != nil {
		return "", fmt.Errorf("stream: %w", err)
	}
	if _, writeErr := w.Write(audio); writeErr != nil {
		return "", fmt.Errorf("write chunk: %w", writeErr)
	}
	return streamResult.ContentType, nil
}
// GetModelCapabilities returns the capability descriptor of a speech
// model, taken from its registry template (matched via
// findModelTemplate, which may fall back to the provider's default or
// first template). When the capability entry carries no config schema,
// the template's top-level schema is used.
func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
	if err != nil {
		return nil, err
	}
	template := findModelTemplate(def.Models, def.DefaultModel, modelRow.ModelID)
	if template == nil {
		return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
	}
	// Copy, then backfill the schema so the template stays untouched.
	caps := template.Capabilities
	if len(caps.ConfigSchema.Fields) == 0 {
		caps.ConfigSchema = template.ConfigSchema
	}
	return &caps, nil
}

// GetSpeechModelCapabilities is an alias for GetModelCapabilities.
func (s *Service) GetSpeechModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
	return s.GetModelCapabilities(ctx, modelID)
}
// GetTranscriptionModelCapabilities mirrors GetModelCapabilities for
// transcription models, resolving against the provider's transcription
// templates and default transcription model.
func (s *Service) GetTranscriptionModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get transcription model: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
	if err != nil {
		return nil, err
	}
	template := findModelTemplate(def.TranscriptionModels, def.DefaultTranscriptionModel, modelRow.ModelID)
	if template == nil {
		return nil, fmt.Errorf("transcription model capabilities not found: %s", modelRow.ModelID)
	}
	caps := template.Capabilities
	if len(caps.ConfigSchema.Fields) == 0 {
		caps.ConfigSchema = template.ConfigSchema
	}
	return &caps, nil
}
// FetchRemoteModels asks the provider's remote API for its available
// speech models. Only providers whose registry definition advertises
// list support (SupportsList with a non-nil Factory) are queried.
// Remote ids that match a curated template are returned enriched via
// mergeRemoteModelInfo; unknown ids get a minimal entry.
func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	if !def.SupportsList || def.Factory == nil {
		return nil, fmt.Errorf("speech provider does not support model discovery: %s", providerRow.ClientType)
	}
	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}
	remoteModels, err := provider.ListModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech models: %w", err)
	}
	discovered := make([]ModelInfo, 0, len(remoteModels))
	for _, remoteModel := range remoteModels {
		// Skip nil or id-less entries the remote API may return.
		if remoteModel == nil || remoteModel.ID == "" {
			continue
		}
		discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.Models))
	}
	return discovered, nil
}
// FetchRemoteTranscriptionModels mirrors FetchRemoteModels for
// transcription: it requires SupportsTranscriptionList and a non-nil
// TranscriptionFactory, and enriches results from the transcription
// templates.
func (s *Service) FetchRemoteTranscriptionModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	if !def.SupportsTranscriptionList || def.TranscriptionFactory == nil {
		return nil, fmt.Errorf("speech provider does not support transcription model discovery: %s", providerRow.ClientType)
	}
	provider, err := def.TranscriptionFactory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build transcription provider: %w", err)
	}
	remoteModels, err := provider.ListModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list transcription models: %w", err)
	}
	discovered := make([]ModelInfo, 0, len(remoteModels))
	for _, remoteModel := range remoteModels {
		if remoteModel == nil || remoteModel.ID == "" {
			continue
		}
		discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.TranscriptionModels))
	}
	return discovered, nil
}
// Transcribe runs speech-to-text on the given audio bytes with the
// given transcription model. filename/contentType describe the upload;
// overrideCfg wins over model and provider config.
func (s *Service) Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (*sdk.TranscriptionResult, error) {
	params, err := s.resolveTranscriptionParams(ctx, modelID, audio, filename, contentType, overrideCfg)
	if err != nil {
		return nil, err
	}
	result, err := sdk.Transcribe(ctx,
		sdk.WithTranscriptionModel(params.model),
		sdk.WithAudio(audio, filename, contentType),
		sdk.WithTranscriptionConfig(params.config),
	)
	if err != nil {
		return nil, fmt.Errorf("transcribe: %w", err)
	}
	return result, nil
}
// resolvedSpeechParams bundles an SDK speech model handle with the
// merged (provider < model < override) configuration.
type resolvedSpeechParams struct {
	model  *sdk.SpeechModel
	config map[string]any
}

// resolvedTranscriptionParams is the transcription counterpart of
// resolvedSpeechParams.
type resolvedTranscriptionParams struct {
	model  *sdk.TranscriptionModel
	config map[string]any
}
// resolveSpeechParams loads the speech model and its provider from the
// database and builds the SDK model handle plus the effective config.
// Config precedence, lowest to highest: provider config, model config,
// per-call override.
func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
	_ = text // reserved for future per-text validation; currently unused
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}
	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	// Guard against registry entries without a speech factory: calling
	// a nil Factory would panic (FetchRemoteModels does the same check).
	if def.Factory == nil {
		return nil, fmt.Errorf("speech provider has no factory: %s", providerRow.ClientType)
	}
	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}
	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
	return &resolvedSpeechParams{
		model:  &sdk.SpeechModel{ID: modelRow.ModelID, Provider: provider},
		config: cfg,
	}, nil
}
// resolveTranscriptionParams loads the transcription model and its
// provider and builds the SDK model handle plus the effective config
// (provider < model < override). The audio/filename/contentType
// arguments are accepted for future validation but currently unused.
func (s *Service) resolveTranscriptionParams(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (*resolvedTranscriptionParams, error) {
	_ = audio
	_ = filename
	_ = contentType
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get transcription model: %w", err)
	}
	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	// Guard against registry entries without a transcription factory:
	// calling a nil TranscriptionFactory would panic
	// (FetchRemoteTranscriptionModels performs the same check).
	if def.TranscriptionFactory == nil {
		return nil, fmt.Errorf("speech provider has no transcription factory: %s", providerRow.ClientType)
	}
	provider, err := def.TranscriptionFactory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build transcription provider: %w", err)
	}
	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
	return &resolvedTranscriptionParams{
		model:  &sdk.TranscriptionModel{ID: modelRow.ModelID, Provider: provider},
		config: cfg,
	}, nil
}
// parseConfig decodes a raw JSON config blob into a generic map.
// Every failure mode — empty input, malformed JSON, or a JSON null —
// yields an empty non-nil map so callers never need a nil check.
func parseConfig(raw []byte) map[string]any {
	if len(raw) == 0 {
		return map[string]any{}
	}
	var decoded map[string]any
	if unmarshalErr := json.Unmarshal(raw, &decoded); unmarshalErr != nil || decoded == nil {
		return map[string]any{}
	}
	return decoded
}
// mergeConfig flattens the given maps into a single new map; keys appearing
// in later maps override earlier ones. Always returns a non-nil map, even
// with no arguments.
func mergeConfig(parts ...map[string]any) map[string]any {
	merged := map[string]any{}
	for _, part := range parts {
		for k, v := range part {
			merged[k] = v
		}
	}
	return merged
}
// mergeRemoteModelInfo returns the registry default entry for modelID when
// one exists; otherwise it synthesizes a minimal entry whose display name is
// the ID itself.
func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
	for i := range defaults {
		if defaults[i].ID == modelID {
			return defaults[i]
		}
	}
	return ModelInfo{ID: modelID, Name: modelID}
}
// shouldHideModel reports whether the given model ID is template-only for its
// client type and should therefore be hidden from listings. Unknown client
// types are never hidden.
func (s *Service) shouldHideModel(clientType string, modelType models.ModelType, modelID string) bool {
	if def, err := s.registry.Get(models.ClientType(clientType)); err == nil {
		return shouldHideTemplateModel(def, modelType, modelID)
	}
	return false
}
// shouldHideTemplateModel reports whether modelID is marked template-only in
// the provider definition's catalog for the given model type. Providers that
// do not support listing for that type never hide models, and IDs absent from
// the catalog stay visible. Model types other than speech/transcription are
// never hidden.
func shouldHideTemplateModel(def ProviderDefinition, modelType models.ModelType, modelID string) bool {
	switch modelType {
	case models.ModelTypeSpeech:
		if def.SupportsList {
			for i := range def.Models {
				if def.Models[i].ID == modelID {
					return def.Models[i].TemplateOnly
				}
			}
		}
	case models.ModelTypeTranscription:
		if def.SupportsTranscriptionList {
			for i := range def.TranscriptionModels {
				if def.TranscriptionModels[i].ID == modelID {
					return def.TranscriptionModels[i].TemplateOnly
				}
			}
		}
	}
	return false
}
// findModelTemplate picks a template entry from modelsList by preference:
// exact modelID match first, then the provider's defaultModel (when set),
// then the first list entry. Returns nil only when the list is empty.
func findModelTemplate(modelsList []ModelInfo, defaultModel string, modelID string) *ModelInfo {
	find := func(id string) *ModelInfo {
		for i := range modelsList {
			if modelsList[i].ID == id {
				return &modelsList[i]
			}
		}
		return nil
	}
	if match := find(modelID); match != nil {
		return match
	}
	if defaultModel != "" {
		if match := find(defaultModel); match != nil {
			return match
		}
	}
	if len(modelsList) == 0 {
		return nil
	}
	return &modelsList[0]
}
// toSpeechProviderResponse converts a provider DB row into the API response
// shape, masking secret config values (API keys etc.) before they leave the
// server.
func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
	resp := SpeechProviderResponse{
		ID:         row.ID.String(),
		Name:       row.Name,
		ClientType: row.ClientType,
		Enable:     row.Enable,
		Config:     maskSpeechProviderConfig(parseConfig(row.Config)),
		CreatedAt:  row.CreatedAt.Time,
		UpdatedAt:  row.UpdatedAt.Time,
	}
	// Icon is nullable in the DB; a NULL maps to the empty string.
	if row.Icon.Valid {
		resp.Icon = row.Icon.String
	}
	return resp
}
// maskSpeechProviderConfig returns a copy of cfg in which known secret string
// values (API keys and similar credentials) are replaced by a masked form.
// Non-secret keys, empty strings, and non-string values pass through
// unchanged. Always returns a non-nil map.
func maskSpeechProviderConfig(cfg map[string]any) map[string]any {
	masked := make(map[string]any, len(cfg))
	for key, value := range cfg {
		str, isString := value.(string)
		if isString && str != "" && isSpeechSecretKey(key) {
			masked[key] = maskSpeechSecret(str)
		} else {
			masked[key] = value
		}
	}
	return masked
}
// isSpeechSecretKey reports whether a provider config key holds a credential
// that must be masked before appearing in API responses.
func isSpeechSecretKey(key string) bool {
	for _, secret := range [...]string{"api_key", "access_key", "secret_key", "app_key"} {
		if key == secret {
			return true
		}
	}
	return false
}
// maskSpeechSecret obscures a credential while keeping the first and last
// four characters for recognizability. Values of eight bytes or fewer are
// fully masked so nothing of a short secret leaks.
func maskSpeechSecret(value string) string {
	const fullMask = "********"
	n := len(value)
	if n <= 8 {
		return fullMask
	}
	return value[:4] + "****" + value[n-4:]
}
// toSpeechModelFromListRow maps a joined list-query row to the speech model
// API response shape.
func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
	resp := SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toSpeechModelFromModel maps a bare models-table row to the speech model API
// response; the provider client type is supplied by the caller since the row
// carries no join data.
func toSpeechModelFromModel(row sqlc.Model, providerType string) SpeechModelResponse {
	resp := SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: providerType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toSpeechModelWithProviderResponse maps a model+provider join row to the
// speech model API response shape.
func toSpeechModelWithProviderResponse(row sqlc.GetSpeechModelWithProviderRow) SpeechModelResponse {
	resp := SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toTranscriptionModelFromListRow maps a joined list-query row to the
// transcription model API response shape.
func toTranscriptionModelFromListRow(row sqlc.ListTranscriptionModelsRow) TranscriptionModelResponse {
	resp := TranscriptionModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toTranscriptionModelFromModel maps a bare models-table row to the
// transcription model API response; the provider client type is supplied by
// the caller since the row carries no join data.
func toTranscriptionModelFromModel(row sqlc.Model, providerType string) TranscriptionModelResponse {
	resp := TranscriptionModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: providerType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
// toTranscriptionModelWithProviderResponse maps a model+provider join row to
// the transcription model API response shape.
func toTranscriptionModelWithProviderResponse(row sqlc.GetTranscriptionModelWithProviderRow) TranscriptionModelResponse {
	resp := TranscriptionModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
	// Name is nullable; NULL maps to the empty string.
	if row.Name.Valid {
		resp.Name = row.Name.String
	}
	// Best-effort decode: malformed config simply leaves Config nil.
	if len(row.Config) > 0 {
		_ = json.Unmarshal(row.Config, &resp.Config)
	}
	return resp
}
-102
View File
@@ -1,102 +0,0 @@
package audio
import "time"
// ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
type ProviderMetaResponse struct {
Provider string `json:"provider"`
DisplayName string `json:"display_name"`
Description string `json:"description"`
ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
DefaultModel string `json:"default_model,omitempty"`
Models []ModelInfo `json:"models,omitempty"`
DefaultSynthesisModel string `json:"default_synthesis_model,omitempty"`
SynthesisModels []ModelInfo `json:"synthesis_models,omitempty"`
SupportsSynthesisList bool `json:"supports_synthesis_list,omitempty"`
DefaultTranscriptionModel string `json:"default_transcription_model,omitempty"`
TranscriptionModels []ModelInfo `json:"transcription_models,omitempty"`
SupportsTranscriptionList bool `json:"supports_transcription_list,omitempty"`
}
// SpeechProviderResponse represents a speech-capable provider from the unified providers table.
type SpeechProviderResponse struct {
ID string `json:"id"`
Name string `json:"name"`
ClientType string `json:"client_type"`
Icon string `json:"icon,omitempty"`
Enable bool `json:"enable"`
Config map[string]any `json:"config,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// SpeechModelResponse represents a speech model from the unified models table.
type SpeechModelResponse struct {
ID string `json:"id"`
ModelID string `json:"model_id"`
Name string `json:"name"`
ProviderID string `json:"provider_id"`
ProviderType string `json:"provider_type,omitempty"`
Config map[string]any `json:"config,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// TranscriptionModelResponse represents a transcription model from the unified models table.
type TranscriptionModelResponse struct {
ID string `json:"id"`
ModelID string `json:"model_id"`
Name string `json:"name"`
ProviderID string `json:"provider_id"`
ProviderType string `json:"provider_type,omitempty"`
Config map[string]any `json:"config,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// UpdateSpeechProviderRequest is used for updating a speech provider.
type UpdateSpeechProviderRequest struct {
Name *string `json:"name,omitempty"`
Enable *bool `json:"enable,omitempty"`
}
// UpdateSpeechModelRequest is used for updating a speech model.
type UpdateSpeechModelRequest struct {
Name *string `json:"name,omitempty"`
Config map[string]any `json:"config,omitempty"`
}
// TestSynthesizeRequest represents a text-to-speech test request.
type TestSynthesizeRequest struct {
Text string `json:"text"`
Config map[string]any `json:"config,omitempty"`
}
// TestTranscriptionRequest represents an audio-to-text test request.
type TestTranscriptionRequest struct {
Config map[string]any `json:"config,omitempty"`
}
// TestTranscriptionResponse represents the result of a transcription test.
type TestTranscriptionResponse struct {
Text string `json:"text"`
Language string `json:"language,omitempty"`
DurationSeconds float64 `json:"duration_seconds,omitempty"`
Words []TranscriptionWord `json:"words,omitempty"`
Metadata map[string]any `json:"metadata,omitempty"`
}
// TranscriptionWord represents a single word alignment from a transcription result.
type TranscriptionWord struct {
Text string `json:"text"`
Start float64 `json:"start,omitempty"`
End float64 `json:"end,omitempty"`
SpeakerID string `json:"speaker_id,omitempty"`
}
// ImportModelsResponse represents the response for importing speech models.
type ImportModelsResponse struct {
Created int `json:"created"`
Skipped int `json:"skipped"`
Models []string `json:"models"`
}
@@ -1,4 +1,5 @@
//go:build ignore
// +build ignore
package identities_test
+34 -174
View File
@@ -58,29 +58,14 @@ type mediaIngestor interface {
channel.ContainerAttachmentIngester
}
// speechSynthesizer synthesizes text to speech audio.
type speechSynthesizer interface {
// ttsSynthesizer synthesizes text to speech audio.
type ttsSynthesizer interface {
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
}
// speechModelResolver looks up the speech model ID configured for a bot.
type speechModelResolver interface {
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
}
// TranscriptionResult is the minimal speech-to-text response shape needed by inbound routing.
type TranscriptionResult interface {
GetText() string
}
// transcriptionRecognizer converts inbound audio to text using a configured model.
type transcriptionRecognizer interface {
Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (TranscriptionResult, error)
}
// transcriptionModelResolver looks up the transcription model ID configured for a bot.
type transcriptionModelResolver interface {
ResolveTranscriptionModelID(ctx context.Context, botID string) (string, error)
// ttsModelResolver looks up the TTS model ID configured for a bot.
type ttsModelResolver interface {
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
}
// SessionEnsurer resolves or creates an active session for a route.
@@ -101,29 +86,27 @@ type SessionResult struct {
// ChannelInboundProcessor routes channel inbound messages to the chat gateway.
type ChannelInboundProcessor struct {
runner flow.Runner
routeResolver RouteResolver
message messagepkg.Writer
mediaService mediaIngestor
reactor channelReactor
commandHandler *command.Handler
registry *channel.Registry
logger *slog.Logger
jwtSecret string
tokenTTL time.Duration
identity *IdentityResolver
policy PolicyService
dispatcher *RouteDispatcher
acl chatACL
observer channel.StreamObserver
speechService speechSynthesizer
speechModelResolver speechModelResolver
transcriber transcriptionRecognizer
sttModelResolver transcriptionModelResolver
sessionEnsurer SessionEnsurer
pipeline *pipelinepkg.Pipeline
eventStore *pipelinepkg.EventStore
discussDriver *pipelinepkg.DiscussDriver
runner flow.Runner
routeResolver RouteResolver
message messagepkg.Writer
mediaService mediaIngestor
reactor channelReactor
commandHandler *command.Handler
registry *channel.Registry
logger *slog.Logger
jwtSecret string
tokenTTL time.Duration
identity *IdentityResolver
policy PolicyService
dispatcher *RouteDispatcher
acl chatACL
observer channel.StreamObserver
ttsService ttsSynthesizer
ttsModelResolver ttsModelResolver
sessionEnsurer SessionEnsurer
pipeline *pipelinepkg.Pipeline
eventStore *pipelinepkg.EventStore
discussDriver *pipelinepkg.DiscussDriver
// activeStreams maps "botID:routeID" to a context.CancelFunc for the
// currently running agent stream. Used by /stop to abort generation
@@ -205,23 +188,14 @@ func (p *ChannelInboundProcessor) SetStreamObserver(observer channel.StreamObser
p.observer = observer
}
// SetSpeechService configures the speech synthesizer and settings reader for
// handling <speech> tag events (speech_delta) that require server-side audio synthesis.
func (p *ChannelInboundProcessor) SetSpeechService(synth speechSynthesizer, modelResolver speechModelResolver) {
// SetTtsService configures the TTS synthesizer and settings reader for handling
// <speech> tag events (speech_delta) that require server-side audio synthesis.
func (p *ChannelInboundProcessor) SetTtsService(synth ttsSynthesizer, modelResolver ttsModelResolver) {
if p == nil {
return
}
p.speechService = synth
p.speechModelResolver = modelResolver
}
// SetTranscriptionService configures speech-to-text processing for inbound audio attachments.
func (p *ChannelInboundProcessor) SetTranscriptionService(recognizer transcriptionRecognizer, modelResolver transcriptionModelResolver) {
if p == nil {
return
}
p.transcriber = recognizer
p.sttModelResolver = modelResolver
p.ttsService = synth
p.ttsModelResolver = modelResolver
}
// SetSessionEnsurer configures the session ensurer for auto-creating sessions on routes.
@@ -352,8 +326,6 @@ func (p *ChannelInboundProcessor) HandleInbound(ctx context.Context, cfg channel
}
resolvedAttachments := p.ingestInboundAttachments(ctx, cfg, msg, strings.TrimSpace(identity.BotID), msg.Message.Attachments)
msg.Message.Attachments = resolvedAttachments
hadVoiceAttachment := containsVoiceAttachment(resolvedAttachments)
attachments := mapChannelToChatAttachments(resolvedAttachments)
text = strings.TrimSpace(msg.Message.PlainText())
@@ -494,24 +466,6 @@ func (p *ChannelInboundProcessor) HandleInbound(ctx context.Context, cfg channel
}
shouldTrigger := shouldTriggerAssistantResponse(msg) || identity.ForceReply
if sessionType == sessionpkg.TypeDiscuss || shouldTrigger {
if transcript := p.transcribeInboundAttachments(ctx, strings.TrimSpace(identity.BotID), resolvedAttachments); transcript != "" {
labeledTranscript := formatInboundTranscript(transcript)
if msg.Message.Metadata == nil {
msg.Message.Metadata = make(map[string]any)
}
msg.Message.Metadata["transcript"] = transcript
if plain := strings.TrimSpace(msg.Message.PlainText()); plain == "" {
msg.Message.Text = labeledTranscript
} else if !strings.Contains(plain, transcript) {
msg.Message.Text = plain + "\n\n" + labeledTranscript
}
} else if hadVoiceAttachment && strings.TrimSpace(msg.Message.PlainText()) == "" {
msg.Message.Text = formatVoiceTranscriptionUnavailableNotice(resolvedAttachments)
}
text = strings.TrimSpace(msg.Message.PlainText())
}
if !shouldTrigger {
p.persistPassiveMessage(ctx, identity, msg, text, attachments, resolved.RouteID, sessionID, eventID)
if p.logger != nil {
@@ -1946,97 +1900,6 @@ func (p *ChannelInboundProcessor) loadInboundAttachmentPayload(
}, nil
}
func (p *ChannelInboundProcessor) transcribeInboundAttachments(ctx context.Context, botID string, attachments []channel.Attachment) string {
if p == nil || p.transcriber == nil || p.sttModelResolver == nil || p.mediaService == nil || strings.TrimSpace(botID) == "" {
return ""
}
modelID, err := p.sttModelResolver.ResolveTranscriptionModelID(ctx, botID)
if err != nil || strings.TrimSpace(modelID) == "" {
return ""
}
transcripts := make([]string, 0, len(attachments))
for _, att := range attachments {
if att.Type != channel.AttachmentAudio && att.Type != channel.AttachmentVoice {
continue
}
if strings.TrimSpace(att.ContentHash) == "" {
continue
}
reader, asset, err := p.mediaService.Open(ctx, botID, strings.TrimSpace(att.ContentHash))
if err != nil {
if p.logger != nil {
p.logger.Warn("open inbound audio for transcription failed", slog.Any("error", err), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
}
continue
}
audio, readErr := io.ReadAll(reader)
_ = reader.Close()
if readErr != nil || len(audio) == 0 {
if p.logger != nil {
p.logger.Warn("read inbound audio for transcription failed", slog.Any("error", readErr), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
}
continue
}
filename := strings.TrimSpace(att.Name)
if filename == "" {
filename = "audio" + filepath.Ext(asset.StorageKey)
}
contentType := strings.TrimSpace(att.Mime)
if contentType == "" {
contentType = strings.TrimSpace(asset.Mime)
}
result, txErr := p.transcriber.Transcribe(ctx, modelID, audio, filename, contentType, nil)
if txErr != nil {
if p.logger != nil {
p.logger.Warn("inbound attachment transcription failed", slog.Any("error", txErr), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
}
continue
}
text := strings.TrimSpace(result.GetText())
if text == "" {
continue
}
transcripts = append(transcripts, text)
}
if len(transcripts) == 0 {
return ""
}
return strings.Join(transcripts, "\n\n")
}
func formatInboundTranscript(transcript string) string {
transcript = strings.TrimSpace(transcript)
if transcript == "" {
return ""
}
return "[Voice message transcription]\n" + transcript
}
func containsVoiceAttachment(attachments []channel.Attachment) bool {
for _, att := range attachments {
if att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice {
return true
}
}
return false
}
func formatVoiceTranscriptionUnavailableNotice(attachments []channel.Attachment) string {
paths := make([]string, 0, len(attachments))
for _, att := range attachments {
if att.Type != channel.AttachmentAudio && att.Type != channel.AttachmentVoice {
continue
}
if ref := strings.TrimSpace(att.URL); ref != "" {
paths = append(paths, ref)
}
}
if len(paths) == 0 {
return "[User sent a voice message, but transcription is unavailable.]"
}
return "[User sent a voice message, but transcription is unavailable. Use transcribe_audio with one of these paths if needed: " + strings.Join(paths, ", ") + "]"
}
func openInboundAttachmentURL(ctx context.Context, rawURL string) (inboundAttachmentPayload, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
if err != nil {
@@ -2227,9 +2090,6 @@ func mapChannelToChatAttachments(attachments []channel.Attachment) []conversatio
}
result := make([]conversation.ChatAttachment, 0, len(attachments))
for _, att := range attachments {
if att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice {
continue
}
ca := conversation.ChatAttachment{
Type: string(att.Type),
PlatformKey: att.PlatformKey,
@@ -2304,13 +2164,13 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
outboundAssetRefs *[]conversation.OutboundAssetRef,
assetMu *sync.Mutex,
) {
if p.speechService == nil || p.speechModelResolver == nil {
if p.ttsService == nil || p.ttsModelResolver == nil {
if p.logger != nil {
p.logger.Warn("speech_delta received but TTS service not configured")
}
return
}
modelID, err := p.speechModelResolver.ResolveSpeechModelID(ctx, botID)
modelID, err := p.ttsModelResolver.ResolveTtsModelID(ctx, botID)
if err != nil || strings.TrimSpace(modelID) == "" {
if p.logger != nil {
p.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
@@ -2322,7 +2182,7 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
if text == "" {
continue
}
audioData, contentType, synthErr := p.speechService.Synthesize(ctx, modelID, text, nil)
audioData, contentType, synthErr := p.ttsService.Synthesize(ctx, modelID, text, nil)
if synthErr != nil {
if p.logger != nil {
p.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
+1 -1
View File
@@ -511,7 +511,7 @@ WITH updated AS (
SET display_name = $1,
updated_at = now()
WHERE bots.id = $2
RETURNING id, owner_user_id, display_name, avatar_url, timezone, is_active, status, language, reasoning_enabled, reasoning_effort, chat_model_id, search_provider_id, memory_provider_id, heartbeat_enabled, heartbeat_interval, heartbeat_prompt, heartbeat_model_id, compaction_enabled, compaction_threshold, compaction_ratio, compaction_model_id, title_model_id, image_model_id, discuss_probe_model_id, tts_model_id, transcription_model_id, browser_context_id, persist_full_tool_results, metadata, created_at, updated_at, acl_default_effect
RETURNING id, owner_user_id, display_name, avatar_url, timezone, is_active, status, language, reasoning_enabled, reasoning_effort, chat_model_id, search_provider_id, memory_provider_id, heartbeat_enabled, heartbeat_interval, heartbeat_prompt, heartbeat_model_id, compaction_enabled, compaction_threshold, compaction_ratio, compaction_model_id, title_model_id, image_model_id, discuss_probe_model_id, tts_model_id, browser_context_id, persist_full_tool_results, metadata, created_at, updated_at, acl_default_effect
)
SELECT
updated.id AS id,
-1
View File
@@ -34,7 +34,6 @@ type Bot struct {
ImageModelID pgtype.UUID `json:"image_model_id"`
DiscussProbeModelID pgtype.UUID `json:"discuss_probe_model_id"`
TtsModelID pgtype.UUID `json:"tts_model_id"`
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
BrowserContextID pgtype.UUID `json:"browser_context_id"`
PersistFullToolResults bool `json:"persist_full_tool_results"`
Metadata []byte `json:"metadata"`
+11 -225
View File
@@ -13,7 +13,7 @@ import (
const countModels = `-- name: CountModels :one
SELECT COUNT(*) FROM models
WHERE type NOT IN ('speech', 'transcription')
WHERE type != 'speech'
`
func (q *Queries) CountModels(ctx context.Context) (int64, error) {
@@ -40,19 +40,13 @@ FROM providers
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
)
`
@@ -207,24 +201,6 @@ func (q *Queries) DeleteModelByModelID(ctx context.Context, modelID string) erro
return err
}
const deleteModelByProviderAndType = `-- name: DeleteModelByProviderAndType :exec
DELETE FROM models
WHERE provider_id = $1
AND model_id = $2
AND type = $3
`
type DeleteModelByProviderAndTypeParams struct {
ProviderID pgtype.UUID `json:"provider_id"`
ModelID string `json:"model_id"`
Type string `json:"type"`
}
func (q *Queries) DeleteModelByProviderAndType(ctx context.Context, arg DeleteModelByProviderAndTypeParams) error {
_, err := q.db.Exec(ctx, deleteModelByProviderAndType, arg.ProviderID, arg.ModelID, arg.Type)
return err
}
const deleteModelByProviderIDAndModelID = `-- name: DeleteModelByProviderIDAndModelID :exec
DELETE FROM models
WHERE provider_id = $1
@@ -318,27 +294,6 @@ func (q *Queries) GetModelByProviderAndModelID(ctx context.Context, arg GetModel
return i, err
}
const getProviderByClientType = `-- name: GetProviderByClientType :one
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE client_type = $1
`
func (q *Queries) GetProviderByClientType(ctx context.Context, clientType string) (Provider, error) {
row := q.db.QueryRow(ctx, getProviderByClientType, clientType)
var i Provider
err := row.Scan(
&i.ID,
&i.Name,
&i.ClientType,
&i.Icon,
&i.Enable,
&i.Config,
&i.Metadata,
&i.CreatedAt,
&i.UpdatedAt,
)
return i, err
}
const getProviderByID = `-- name: GetProviderByID :one
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE id = $1
`
@@ -420,51 +375,12 @@ func (q *Queries) GetSpeechModelWithProvider(ctx context.Context, id pgtype.UUID
return i, err
}
const getTranscriptionModelWithProvider = `-- name: GetTranscriptionModelWithProvider :one
SELECT
m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at,
p.client_type AS provider_type
FROM models m
JOIN providers p ON p.id = m.provider_id
WHERE m.id = $1
AND m.type = 'transcription'
`
type GetTranscriptionModelWithProviderRow struct {
ID pgtype.UUID `json:"id"`
ModelID string `json:"model_id"`
Name pgtype.Text `json:"name"`
ProviderID pgtype.UUID `json:"provider_id"`
Type string `json:"type"`
Config []byte `json:"config"`
CreatedAt pgtype.Timestamptz `json:"created_at"`
UpdatedAt pgtype.Timestamptz `json:"updated_at"`
ProviderType string `json:"provider_type"`
}
func (q *Queries) GetTranscriptionModelWithProvider(ctx context.Context, id pgtype.UUID) (GetTranscriptionModelWithProviderRow, error) {
row := q.db.QueryRow(ctx, getTranscriptionModelWithProvider, id)
var i GetTranscriptionModelWithProviderRow
err := row.Scan(
&i.ID,
&i.ModelID,
&i.Name,
&i.ProviderID,
&i.Type,
&i.Config,
&i.CreatedAt,
&i.UpdatedAt,
&i.ProviderType,
)
return i, err
}
const listEnabledModels = `-- name: ListEnabledModels :many
SELECT m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at
FROM models m
JOIN providers p ON m.provider_id = p.id
WHERE p.enable = true
AND m.type NOT IN ('speech', 'transcription')
AND m.type != 'speech'
ORDER BY m.created_at DESC
`
@@ -609,7 +525,7 @@ func (q *Queries) ListModelVariantsByModelUUID(ctx context.Context, modelUuid pg
const listModels = `-- name: ListModels :many
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
WHERE type NOT IN ('speech', 'transcription')
WHERE type != 'speech'
ORDER BY created_at DESC
`
@@ -717,7 +633,7 @@ func (q *Queries) ListModelsByProviderClientType(ctx context.Context, clientType
const listModelsByProviderID = `-- name: ListModelsByProviderID :many
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
WHERE provider_id = $1
AND type NOT IN ('speech', 'transcription')
AND type != 'speech'
ORDER BY created_at DESC
`
@@ -831,19 +747,13 @@ SELECT id, name, client_type, icon, enable, config, metadata, created_at, update
WHERE client_type NOT IN (
'edge-speech',
'openai-speech',
'openai-transcription',
'openrouter-speech',
'openrouter-transcription',
'elevenlabs-speech',
'elevenlabs-transcription',
'deepgram-speech',
'deepgram-transcription',
'minimax-speech',
'volcengine-speech',
'alibabacloud-speech',
'microsoft-speech',
'google-speech',
'google-transcription'
'microsoft-speech'
)
ORDER BY created_at DESC
`
@@ -1011,135 +921,6 @@ func (q *Queries) ListSpeechProviders(ctx context.Context) ([]Provider, error) {
return items, nil
}
const listTranscriptionModels = `-- name: ListTranscriptionModels :many
SELECT m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at,
p.client_type AS provider_type
FROM models m
JOIN providers p ON p.id = m.provider_id
WHERE m.type = 'transcription'
ORDER BY m.created_at DESC
`
type ListTranscriptionModelsRow struct {
ID pgtype.UUID `json:"id"`
ModelID string `json:"model_id"`
Name pgtype.Text `json:"name"`
ProviderID pgtype.UUID `json:"provider_id"`
Type string `json:"type"`
Config []byte `json:"config"`
CreatedAt pgtype.Timestamptz `json:"created_at"`
UpdatedAt pgtype.Timestamptz `json:"updated_at"`
ProviderType string `json:"provider_type"`
}
func (q *Queries) ListTranscriptionModels(ctx context.Context) ([]ListTranscriptionModelsRow, error) {
rows, err := q.db.Query(ctx, listTranscriptionModels)
if err != nil {
return nil, err
}
defer rows.Close()
var items []ListTranscriptionModelsRow
for rows.Next() {
var i ListTranscriptionModelsRow
if err := rows.Scan(
&i.ID,
&i.ModelID,
&i.Name,
&i.ProviderID,
&i.Type,
&i.Config,
&i.CreatedAt,
&i.UpdatedAt,
&i.ProviderType,
); err != nil {
return nil, err
}
items = append(items, i)
}
if err := rows.Err(); err != nil {
return nil, err
}
return items, nil
}
const listTranscriptionModelsByProviderID = `-- name: ListTranscriptionModelsByProviderID :many
SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
WHERE provider_id = $1
AND type = 'transcription'
ORDER BY created_at DESC
`
func (q *Queries) ListTranscriptionModelsByProviderID(ctx context.Context, providerID pgtype.UUID) ([]Model, error) {
rows, err := q.db.Query(ctx, listTranscriptionModelsByProviderID, providerID)
if err != nil {
return nil, err
}
defer rows.Close()
var items []Model
for rows.Next() {
var i Model
if err := rows.Scan(
&i.ID,
&i.ModelID,
&i.Name,
&i.ProviderID,
&i.Type,
&i.Config,
&i.CreatedAt,
&i.UpdatedAt,
); err != nil {
return nil, err
}
items = append(items, i)
}
if err := rows.Err(); err != nil {
return nil, err
}
return items, nil
}
const listTranscriptionProviders = `-- name: ListTranscriptionProviders :many
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers
WHERE client_type IN (
'openai-transcription',
'openrouter-transcription',
'elevenlabs-transcription',
'deepgram-transcription',
'google-transcription'
)
ORDER BY created_at DESC
`
func (q *Queries) ListTranscriptionProviders(ctx context.Context) ([]Provider, error) {
rows, err := q.db.Query(ctx, listTranscriptionProviders)
if err != nil {
return nil, err
}
defer rows.Close()
var items []Provider
for rows.Next() {
var i Provider
if err := rows.Scan(
&i.ID,
&i.Name,
&i.ClientType,
&i.Icon,
&i.Enable,
&i.Config,
&i.Metadata,
&i.CreatedAt,
&i.UpdatedAt,
); err != nil {
return nil, err
}
items = append(items, i)
}
if err := rows.Err(); err != nil {
return nil, err
}
return items, nil
}
const updateModel = `-- name: UpdateModel :one
UPDATE models
SET
@@ -1281,6 +1062,11 @@ VALUES ($1, $2, $3, false, $4, '{}')
ON CONFLICT (name) DO UPDATE SET
icon = EXCLUDED.icon,
client_type = EXCLUDED.client_type,
config = CASE
WHEN providers.config->>'api_key' IS NOT NULL AND providers.config->>'api_key' != ''
THEN jsonb_set(EXCLUDED.config, '{api_key}', providers.config->'api_key')
ELSE EXCLUDED.config
END,
updated_at = now()
RETURNING id, name, client_type, icon, enable, config, metadata, created_at, updated_at
`
+4 -16
View File
@@ -30,7 +30,6 @@ SET language = 'auto',
search_provider_id = NULL,
memory_provider_id = NULL,
tts_model_id = NULL,
transcription_model_id = NULL,
browser_context_id = NULL,
persist_full_tool_results = false,
updated_at = now()
@@ -63,7 +62,6 @@ SELECT
memory_providers.id AS memory_provider_id,
image_models.id AS image_model_id,
tts_models.id AS tts_model_id,
transcription_models.id AS transcription_model_id,
browser_contexts.id AS browser_context_id,
bots.persist_full_tool_results
FROM bots
@@ -75,7 +73,6 @@ LEFT JOIN models AS image_models ON image_models.id = bots.image_model_id
LEFT JOIN search_providers ON search_providers.id = bots.search_provider_id
LEFT JOIN memory_providers ON memory_providers.id = bots.memory_provider_id
LEFT JOIN models AS tts_models ON tts_models.id = bots.tts_model_id
LEFT JOIN models AS transcription_models ON transcription_models.id = bots.transcription_model_id
LEFT JOIN browser_contexts ON browser_contexts.id = bots.browser_context_id
WHERE bots.id = $1
`
@@ -100,7 +97,6 @@ type GetSettingsByBotIDRow struct {
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
ImageModelID pgtype.UUID `json:"image_model_id"`
TtsModelID pgtype.UUID `json:"tts_model_id"`
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
BrowserContextID pgtype.UUID `json:"browser_context_id"`
PersistFullToolResults bool `json:"persist_full_tool_results"`
}
@@ -128,7 +124,6 @@ func (q *Queries) GetSettingsByBotID(ctx context.Context, id pgtype.UUID) (GetSe
&i.MemoryProviderID,
&i.ImageModelID,
&i.TtsModelID,
&i.TranscriptionModelID,
&i.BrowserContextID,
&i.PersistFullToolResults,
)
@@ -156,12 +151,11 @@ WITH updated AS (
memory_provider_id = COALESCE($16::uuid, bots.memory_provider_id),
image_model_id = COALESCE($17::uuid, bots.image_model_id),
tts_model_id = COALESCE($18::uuid, bots.tts_model_id),
transcription_model_id = COALESCE($19::uuid, bots.transcription_model_id),
browser_context_id = COALESCE($20::uuid, bots.browser_context_id),
persist_full_tool_results = $21,
browser_context_id = COALESCE($19::uuid, bots.browser_context_id),
persist_full_tool_results = $20,
updated_at = now()
WHERE bots.id = $22
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.transcription_model_id, bots.browser_context_id, bots.persist_full_tool_results
WHERE bots.id = $21
RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.browser_context_id, bots.persist_full_tool_results
)
SELECT
updated.id AS bot_id,
@@ -183,7 +177,6 @@ SELECT
memory_providers.id AS memory_provider_id,
image_models.id AS image_model_id,
tts_models.id AS tts_model_id,
transcription_models.id AS transcription_model_id,
browser_contexts.id AS browser_context_id,
updated.persist_full_tool_results
FROM updated
@@ -195,7 +188,6 @@ LEFT JOIN models AS image_models ON image_models.id = updated.image_model_id
LEFT JOIN search_providers ON search_providers.id = updated.search_provider_id
LEFT JOIN memory_providers ON memory_providers.id = updated.memory_provider_id
LEFT JOIN models AS tts_models ON tts_models.id = updated.tts_model_id
LEFT JOIN models AS transcription_models ON transcription_models.id = updated.transcription_model_id
LEFT JOIN browser_contexts ON browser_contexts.id = updated.browser_context_id
`
@@ -218,7 +210,6 @@ type UpsertBotSettingsParams struct {
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
ImageModelID pgtype.UUID `json:"image_model_id"`
TtsModelID pgtype.UUID `json:"tts_model_id"`
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
BrowserContextID pgtype.UUID `json:"browser_context_id"`
PersistFullToolResults bool `json:"persist_full_tool_results"`
ID pgtype.UUID `json:"id"`
@@ -244,7 +235,6 @@ type UpsertBotSettingsRow struct {
MemoryProviderID pgtype.UUID `json:"memory_provider_id"`
ImageModelID pgtype.UUID `json:"image_model_id"`
TtsModelID pgtype.UUID `json:"tts_model_id"`
TranscriptionModelID pgtype.UUID `json:"transcription_model_id"`
BrowserContextID pgtype.UUID `json:"browser_context_id"`
PersistFullToolResults bool `json:"persist_full_tool_results"`
}
@@ -269,7 +259,6 @@ func (q *Queries) UpsertBotSettings(ctx context.Context, arg UpsertBotSettingsPa
arg.MemoryProviderID,
arg.ImageModelID,
arg.TtsModelID,
arg.TranscriptionModelID,
arg.BrowserContextID,
arg.PersistFullToolResults,
arg.ID,
@@ -295,7 +284,6 @@ func (q *Queries) UpsertBotSettings(ctx context.Context, arg UpsertBotSettingsPa
&i.MemoryProviderID,
&i.ImageModelID,
&i.TtsModelID,
&i.TranscriptionModelID,
&i.BrowserContextID,
&i.PersistFullToolResults,
)
+13 -13
View File
@@ -7,28 +7,28 @@ import (
"github.com/labstack/echo/v4"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/settings"
"github.com/memohai/memoh/internal/tts"
)
// BotAudioHandler handles per-bot speech synthesis requests from the agent tool.
type BotAudioHandler struct {
audioService *audiopkg.Service
// BotTtsHandler handles per-bot TTS synthesis requests from the agent tool.
type BotTtsHandler struct {
ttsService *tts.Service
settingsService *settings.Service
tempStore *audiopkg.TempStore
tempStore *tts.TempStore
logger *slog.Logger
}
func NewBotAudioHandler(log *slog.Logger, audioService *audiopkg.Service, settingsService *settings.Service, tempStore *audiopkg.TempStore) *BotAudioHandler {
return &BotAudioHandler{
audioService: audioService,
func NewBotTtsHandler(log *slog.Logger, ttsService *tts.Service, settingsService *settings.Service, tempStore *tts.TempStore) *BotTtsHandler {
return &BotTtsHandler{
ttsService: ttsService,
settingsService: settingsService,
tempStore: tempStore,
logger: log.With(slog.String("handler", "bot_audio")),
logger: log.With(slog.String("handler", "bot_tts")),
}
}
func (h *BotAudioHandler) Register(e *echo.Echo) {
func (h *BotTtsHandler) Register(e *echo.Echo) {
e.POST("/bots/:bot_id/tts/synthesize", h.Synthesize)
}
@@ -54,7 +54,7 @@ type synthesizeResponse struct {
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /bots/{bot_id}/tts/synthesize [post].
func (h *BotAudioHandler) Synthesize(c echo.Context) error {
func (h *BotTtsHandler) Synthesize(c echo.Context) error {
botID := strings.TrimSpace(c.Param("bot_id"))
if botID == "" {
return echo.NewHTTPError(http.StatusBadRequest, "bot_id is required")
@@ -88,10 +88,10 @@ func (h *BotAudioHandler) Synthesize(c echo.Context) error {
return echo.NewHTTPError(http.StatusInternalServerError, "failed to create temp file")
}
contentType, streamErr := h.audioService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
contentType, streamErr := h.ttsService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
closeErr := f.Close()
if streamErr != nil {
h.logger.Error("speech synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
h.logger.Error("tts synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
h.tempStore.Delete(tempID)
return echo.NewHTTPError(http.StatusInternalServerError, streamErr.Error())
}
+24 -24
View File
@@ -30,30 +30,30 @@ import (
messagepkg "github.com/memohai/memoh/internal/message"
)
// localSpeechSynthesizer synthesizes text to speech audio.
type localSpeechSynthesizer interface {
// localTtsSynthesizer synthesizes text to speech audio.
type localTtsSynthesizer interface {
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
}
// localSpeechModelResolver resolves speech model IDs for bots.
type localSpeechModelResolver interface {
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
// localTtsModelResolver resolves TTS model IDs for bots.
type localTtsModelResolver interface {
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
}
// LocalChannelHandler handles local channel routes (WebUI / API) backed by bot history.
type LocalChannelHandler struct {
channelType channel.ChannelType
channelManager *channel.Manager
channelStore *channel.Store
chatService *conversation.Service
routeHub *local.RouteHub
botService *bots.Service
accountService *accounts.Service
resolver *flow.Resolver
mediaService *media.Service
speechService localSpeechSynthesizer
speechModelResolver localSpeechModelResolver
logger *slog.Logger
channelType channel.ChannelType
channelManager *channel.Manager
channelStore *channel.Store
chatService *conversation.Service
routeHub *local.RouteHub
botService *bots.Service
accountService *accounts.Service
resolver *flow.Resolver
mediaService *media.Service
ttsService localTtsSynthesizer
ttsModelResolver localTtsModelResolver
logger *slog.Logger
}
// NewLocalChannelHandler creates a local channel handler.
@@ -80,10 +80,10 @@ func (h *LocalChannelHandler) SetMediaService(svc *media.Service) {
h.mediaService = svc
}
// SetSpeechService configures speech synthesis for handling speech_delta events.
func (h *LocalChannelHandler) SetSpeechService(synth localSpeechSynthesizer, resolver localSpeechModelResolver) {
h.speechService = synth
h.speechModelResolver = resolver
// SetTtsService configures TTS synthesis for handling speech_delta events.
func (h *LocalChannelHandler) SetTtsService(synth localTtsSynthesizer, resolver localTtsModelResolver) {
h.ttsService = synth
h.ttsModelResolver = resolver
}
// Register registers the local channel routes.
@@ -719,12 +719,12 @@ func (h *LocalChannelHandler) ingestSingleAttachment(ctx context.Context, botID,
// wsSynthesizeSpeech handles speech_delta events by synthesizing audio and
// injecting attachment_delta events with the resulting voice attachments.
func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID string, original json.RawMessage) []json.RawMessage {
if h.speechService == nil || h.speechModelResolver == nil {
if h.ttsService == nil || h.ttsModelResolver == nil {
h.logger.Warn("speech_delta received but TTS service not configured")
return nil
}
modelID, err := h.speechModelResolver.ResolveSpeechModelID(ctx, botID)
modelID, err := h.ttsModelResolver.ResolveTtsModelID(ctx, botID)
if err != nil || strings.TrimSpace(modelID) == "" {
h.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
return nil
@@ -746,7 +746,7 @@ func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID stri
continue
}
audioData, contentType, synthErr := h.speechService.Synthesize(ctx, modelID, text, nil)
audioData, contentType, synthErr := h.ttsService.Synthesize(ctx, modelID, text, nil)
if synthErr != nil {
h.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
continue
+29 -327
View File
@@ -1,83 +1,55 @@
package handlers
import (
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"mime/multipart"
"net/http"
"strings"
"github.com/labstack/echo/v4"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/models"
"github.com/memohai/memoh/internal/tts"
)
type AudioHandler struct {
service *audiopkg.Service
type SpeechHandler struct {
service *tts.Service
modelsService *models.Service
logger *slog.Logger
}
func NewAudioHandler(log *slog.Logger, service *audiopkg.Service, modelsService *models.Service) *AudioHandler {
return &AudioHandler{
func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
return &SpeechHandler{
service: service,
modelsService: modelsService,
logger: log.With(slog.String("handler", "audio")),
logger: log.With(slog.String("handler", "speech")),
}
}
func (h *AudioHandler) Register(e *echo.Echo) {
func (h *SpeechHandler) Register(e *echo.Echo) {
pg := e.Group("/speech-providers")
pg.GET("", h.ListProviders)
pg.GET("/:id", h.GetProvider)
pg.GET("/meta", h.ListSpeechMeta)
pg.GET("/meta", h.ListMeta)
pg.GET("/:id/models", h.ListModelsByProvider)
pg.POST("/:id/import-models", h.ImportModels)
tpg := e.Group("/transcription-providers")
tpg.GET("", h.ListTranscriptionProviders)
tpg.GET("/meta", h.ListTranscriptionMeta)
tpg.GET("/:id", h.GetProvider)
tpg.GET("/:id/models", h.ListTranscriptionModelsByProvider)
tpg.POST("/:id/import-models", h.ImportTranscriptionModels)
mg := e.Group("/speech-models")
mg.GET("", h.ListModels)
mg.GET("/:id", h.GetModel)
mg.PUT("/:id", h.UpdateModel)
mg.GET("/:id/capabilities", h.GetModelCapabilities)
mg.POST("/:id/test", h.TestModel)
tg := e.Group("/transcription-models")
tg.GET("", h.ListTranscriptionModels)
tg.GET("/:id", h.GetTranscriptionModel)
tg.PUT("/:id", h.UpdateTranscriptionModel)
tg.GET("/:id/capabilities", h.GetTranscriptionModelCapabilities)
tg.POST("/:id/test", h.TestTranscriptionModel)
}
// ListMeta godoc
// @Summary List speech provider metadata
// @Description List available speech provider types with their models and capabilities
// @Tags speech-providers
// @Success 200 {array} audiopkg.ProviderMetaResponse
// @Success 200 {array} tts.ProviderMetaResponse
// @Router /speech-providers/meta [get].
func (h *AudioHandler) ListSpeechMeta(c echo.Context) error {
return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context()))
}
// ListTranscriptionMeta godoc
// @Summary List transcription provider metadata
// @Description List available transcription provider types with their models and capabilities
// @Tags transcription-providers
// @Success 200 {array} audiopkg.ProviderMetaResponse
// @Router /transcription-providers/meta [get].
func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
return c.JSON(http.StatusOK, h.service.ListTranscriptionMeta(c.Request().Context()))
func (h *SpeechHandler) ListMeta(c echo.Context) error {
return c.JSON(http.StatusOK, h.service.ListMeta(c.Request().Context()))
}
// ListProviders godoc
@@ -85,10 +57,10 @@ func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
// @Description List providers that support speech (filtered view of unified providers table)
// @Tags speech-providers
// @Produce json
// @Success 200 {array} audiopkg.SpeechProviderResponse
// @Success 200 {array} tts.SpeechProviderResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers [get].
func (h *AudioHandler) ListProviders(c echo.Context) error {
func (h *SpeechHandler) ListProviders(c echo.Context) error {
items, err := h.service.ListSpeechProviders(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -96,34 +68,17 @@ func (h *AudioHandler) ListProviders(c echo.Context) error {
return c.JSON(http.StatusOK, items)
}
// ListTranscriptionProviders godoc
// @Summary List transcription providers
// @Description List providers that support transcription (filtered view of unified providers table)
// @Tags transcription-providers
// @Produce json
// @Success 200 {array} audiopkg.SpeechProviderResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers [get].
func (h *AudioHandler) ListTranscriptionProviders(c echo.Context) error {
items, err := h.service.ListTranscriptionProviders(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, items)
}
// GetProvider godoc
// @Summary Get speech provider
// @Description Get a speech provider with masked config values
// @Tags speech-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} audiopkg.SpeechProviderResponse
// @Success 200 {object} tts.SpeechProviderResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Router /speech-providers/{id} [get].
// @Router /transcription-providers/{id} [get].
func (h *AudioHandler) GetProvider(c echo.Context) error {
func (h *SpeechHandler) GetProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -141,11 +96,11 @@ func (h *AudioHandler) GetProvider(c echo.Context) error {
// @Tags speech-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {array} audiopkg.SpeechModelResponse
// @Success 200 {array} tts.SpeechModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers/{id}/models [get].
func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -164,12 +119,12 @@ func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
// @Accept json
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} audiopkg.ImportModelsResponse
// @Success 200 {object} tts.ImportModelsResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers/{id}/import-models [post].
func (h *AudioHandler) ImportModels(c echo.Context) error {
func (h *SpeechHandler) ImportModels(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -180,7 +135,7 @@ func (h *AudioHandler) ImportModels(c echo.Context) error {
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", err))
}
resp := audiopkg.ImportModelsResponse{
resp := tts.ImportModelsResponse{
Models: make([]string, 0, len(remoteModels)),
}
@@ -212,92 +167,15 @@ func (h *AudioHandler) ImportModels(c echo.Context) error {
return c.JSON(http.StatusOK, resp)
}
// ListTranscriptionModelsByProvider godoc
// @Summary List transcription models by provider
// @Description List models of type 'transcription' for a specific transcription provider
// @Tags transcription-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {array} audiopkg.TranscriptionModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers/{id}/models [get].
func (h *AudioHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
items, err := h.service.ListTranscriptionModelsByProvider(c.Request().Context(), id)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, items)
}
// ImportTranscriptionModels godoc
// @Summary Import transcription models from provider
// @Description Fetch models using the configured transcription provider and import them into the unified models table
// @Tags transcription-providers
// @Accept json
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} audiopkg.ImportModelsResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers/{id}/import-models [post].
func (h *AudioHandler) ImportTranscriptionModels(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
remoteModels, err := h.service.FetchRemoteTranscriptionModels(c.Request().Context(), id)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote transcription models: %v", err))
}
resp := audiopkg.ImportModelsResponse{
Models: make([]string, 0, len(remoteModels)),
}
for _, model := range remoteModels {
name := strings.TrimSpace(model.Name)
if name == "" {
name = model.ID
}
_, err := h.modelsService.Create(c.Request().Context(), models.AddRequest{
ModelID: model.ID,
Name: name,
ProviderID: id,
Type: models.ModelTypeTranscription,
Config: models.ModelConfig{},
})
if err != nil {
if errors.Is(err, models.ErrModelIDAlreadyExists) {
resp.Skipped++
continue
}
h.logger.Warn("failed to import transcription model", slog.String("model_id", model.ID), slog.Any("error", err))
continue
}
resp.Created++
resp.Models = append(resp.Models, model.ID)
}
return c.JSON(http.StatusOK, resp)
}
// ListModels godoc
// @Summary List all speech models
// @Description List all models of type 'speech' (filtered view of unified models table)
// @Tags speech-models
// @Produce json
// @Success 200 {array} audiopkg.SpeechModelResponse
// @Success 200 {array} tts.SpeechModelResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models [get].
func (h *AudioHandler) ListModels(c echo.Context) error {
func (h *SpeechHandler) ListModels(c echo.Context) error {
items, err := h.service.ListSpeechModels(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -305,31 +183,15 @@ func (h *AudioHandler) ListModels(c echo.Context) error {
return c.JSON(http.StatusOK, items)
}
// ListTranscriptionModels godoc
// @Summary List all transcription models
// @Description List all models of type 'transcription' (filtered view of unified models table)
// @Tags transcription-models
// @Produce json
// @Success 200 {array} audiopkg.TranscriptionModelResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models [get].
func (h *AudioHandler) ListTranscriptionModels(c echo.Context) error {
items, err := h.service.ListTranscriptionModels(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, items)
}
// GetModel godoc
// @Summary Get a speech model
// @Tags speech-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.SpeechModelResponse
// @Success 200 {object} tts.SpeechModelResponse
// @Failure 404 {object} ErrorResponse
// @Router /speech-models/{id} [get].
func (h *AudioHandler) GetModel(c echo.Context) error {
func (h *SpeechHandler) GetModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -341,89 +203,15 @@ func (h *AudioHandler) GetModel(c echo.Context) error {
return c.JSON(http.StatusOK, resp)
}
// UpdateModel godoc
// @Summary Update a speech model
// @Tags speech-models
// @Accept json
// @Produce json
// @Param id path string true "Model ID"
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
// @Success 200 {object} audiopkg.SpeechModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models/{id} [put].
func (h *AudioHandler) UpdateModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req audiopkg.UpdateSpeechModelRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
resp, err := h.service.UpdateSpeechModel(c.Request().Context(), id, req)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, resp)
}
// GetTranscriptionModel godoc
// @Summary Get a transcription model
// @Tags transcription-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.TranscriptionModelResponse
// @Failure 404 {object} ErrorResponse
// @Router /transcription-models/{id} [get].
func (h *AudioHandler) GetTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
resp, err := h.service.GetTranscriptionModel(c.Request().Context(), id)
if err != nil {
return echo.NewHTTPError(http.StatusNotFound, err.Error())
}
return c.JSON(http.StatusOK, resp)
}
// UpdateTranscriptionModel godoc
// @Summary Update a transcription model
// @Tags transcription-models
// @Accept json
// @Produce json
// @Param id path string true "Model ID"
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
// @Success 200 {object} audiopkg.TranscriptionModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models/{id} [put].
func (h *AudioHandler) UpdateTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req audiopkg.UpdateSpeechModelRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
resp, err := h.service.UpdateTranscriptionModel(c.Request().Context(), id, req)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
return c.JSON(http.StatusOK, resp)
}
// GetModelCapabilities godoc
// @Summary Get speech model capabilities
// @Tags speech-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.ModelCapabilities
// @Success 200 {object} tts.ModelCapabilities
// @Failure 404 {object} ErrorResponse
// @Router /speech-models/{id}/capabilities [get].
func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -435,26 +223,6 @@ func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
return c.JSON(http.StatusOK, caps)
}
// GetTranscriptionModelCapabilities godoc
// @Summary Get transcription model capabilities
// @Tags transcription-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.ModelCapabilities
// @Failure 404 {object} ErrorResponse
// @Router /transcription-models/{id}/capabilities [get].
func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
caps, err := h.service.GetTranscriptionModelCapabilities(c.Request().Context(), id)
if err != nil {
return echo.NewHTTPError(http.StatusNotFound, err.Error())
}
return c.JSON(http.StatusOK, caps)
}
// TestModel godoc
// @Summary Test speech model synthesis
// @Description Synthesize text using a specific model's config and return audio
@@ -462,17 +230,17 @@ func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
// @Accept json
// @Produce application/octet-stream
// @Param id path string true "Model ID"
// @Param request body audiopkg.TestSynthesizeRequest true "Text to synthesize"
// @Param request body tts.TestSynthesizeRequest true "Text to synthesize"
// @Success 200 {file} binary "Audio data"
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models/{id}/test [post].
func (h *AudioHandler) TestModel(c echo.Context) error {
func (h *SpeechHandler) TestModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req audiopkg.TestSynthesizeRequest
var req tts.TestSynthesizeRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
@@ -490,69 +258,3 @@ func (h *AudioHandler) TestModel(c echo.Context) error {
}
return c.Blob(http.StatusOK, contentType, audio)
}
// TestTranscriptionModel godoc
// @Summary Test transcription model recognition
// @Description Transcribe uploaded audio using a specific model's config and return structured text output
// @Tags transcription-models
// @Accept mpfd
// @Produce json
// @Param id path string true "Model ID"
// @Param file formData file true "Audio file"
// @Param config formData string false "Optional JSON config"
// @Success 200 {object} audiopkg.TestTranscriptionResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models/{id}/test [post].
func (h *AudioHandler) TestTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
file, err := c.FormFile("file")
if err != nil {
return echo.NewHTTPError(http.StatusBadRequest, "file is required")
}
src, err := file.Open()
if err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
defer func(src multipart.File) {
err := src.Close()
if err != nil {
h.logger.Warn("failed to close uploaded file", slog.Any("error", err))
}
}(src)
audio, err := io.ReadAll(src)
if err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
var cfg map[string]any
if raw := strings.TrimSpace(c.FormValue("config")); raw != "" {
if err := json.Unmarshal([]byte(raw), &cfg); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, "invalid config")
}
}
result, err := h.service.Transcribe(c.Request().Context(), id, audio, file.Filename, file.Header.Get("Content-Type"), cfg)
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
resp := audiopkg.TestTranscriptionResponse{
Text: result.Text,
Language: result.Language,
DurationSeconds: result.DurationSeconds,
Metadata: result.ProviderMetadata,
}
if len(result.Words) > 0 {
resp.Words = make([]audiopkg.TranscriptionWord, 0, len(result.Words))
for _, word := range result.Words {
resp.Words = append(resp.Words, audiopkg.TranscriptionWord{
Text: word.Text,
Start: word.Start,
End: word.End,
SpeakerID: word.SpeakerID,
})
}
}
return c.JSON(http.StatusOK, resp)
}
+7 -15
View File
@@ -126,9 +126,9 @@ func (s *Service) List(ctx context.Context) ([]GetResponse, error) {
return s.convertToGetResponseList(dbModels), nil
}
// ListByType returns models filtered by type.
// ListByType returns models filtered by type (chat, embedding, or speech).
func (s *Service) ListByType(ctx context.Context, modelType ModelType) ([]GetResponse, error) {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
return nil, fmt.Errorf("invalid model type: %s", modelType)
}
@@ -165,7 +165,7 @@ func (s *Service) ListEnabled(ctx context.Context) ([]GetResponse, error) {
// ListEnabledByType returns models from enabled providers filtered by type.
func (s *Service) ListEnabledByType(ctx context.Context, modelType ModelType) ([]GetResponse, error) {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
return nil, fmt.Errorf("invalid model type: %s", modelType)
}
dbModels, err := s.queries.ListEnabledModelsByType(ctx, string(modelType))
@@ -206,7 +206,7 @@ func (s *Service) ListByProviderID(ctx context.Context, providerID string) ([]Ge
// ListByProviderIDAndType returns models filtered by provider ID and type.
func (s *Service) ListByProviderIDAndType(ctx context.Context, providerID string, modelType ModelType) ([]GetResponse, error) {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
return nil, fmt.Errorf("invalid model type: %s", modelType)
}
if strings.TrimSpace(providerID) == "" {
@@ -361,7 +361,7 @@ func (s *Service) Count(ctx context.Context) (int64, error) {
// CountByType returns the number of models of a specific type.
func (s *Service) CountByType(ctx context.Context, modelType ModelType) (int64, error) {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
return 0, fmt.Errorf("invalid model type: %s", modelType)
}
@@ -432,19 +432,13 @@ func IsValidClientType(clientType ClientType) bool {
ClientTypeGitHubCopilot,
ClientTypeEdgeSpeech,
ClientTypeOpenAISpeech,
ClientTypeOpenAITranscription,
ClientTypeOpenRouterSpeech,
ClientTypeOpenRouterTranscription,
ClientTypeElevenLabsSpeech,
ClientTypeElevenLabsTranscription,
ClientTypeDeepgramSpeech,
ClientTypeDeepgramTranscription,
ClientTypeMiniMaxSpeech,
ClientTypeVolcengineSpeech,
ClientTypeAlibabaSpeech,
ClientTypeMicrosoftSpeech,
ClientTypeGoogleSpeech,
ClientTypeGoogleTranscription:
ClientTypeMicrosoftSpeech:
return true
default:
return false
@@ -454,9 +448,7 @@ func IsValidClientType(clientType ClientType) bool {
// IsLLMClientType returns true if the client type belongs to the LLM domain
// (chat/embedding), excluding speech-only types (any type ending in "-speech").
func IsLLMClientType(clientType ClientType) bool {
return IsValidClientType(clientType) &&
!strings.HasSuffix(string(clientType), "-speech") &&
!strings.HasSuffix(string(clientType), "-transcription")
return IsValidClientType(clientType) && !strings.HasSuffix(string(clientType), "-speech")
}
// SelectMemoryModel selects a chat model for memory operations.
+19 -26
View File
@@ -9,36 +9,29 @@ import (
type ModelType string
const (
ModelTypeChat ModelType = "chat"
ModelTypeEmbedding ModelType = "embedding"
ModelTypeSpeech ModelType = "speech"
ModelTypeTranscription ModelType = "transcription"
ModelTypeChat ModelType = "chat"
ModelTypeEmbedding ModelType = "embedding"
ModelTypeSpeech ModelType = "speech"
)
type ClientType string
const (
ClientTypeOpenAIResponses ClientType = "openai-responses"
ClientTypeOpenAICompletions ClientType = "openai-completions"
ClientTypeAnthropicMessages ClientType = "anthropic-messages"
ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai"
ClientTypeOpenAICodex ClientType = "openai-codex"
ClientTypeGitHubCopilot ClientType = "github-copilot"
ClientTypeEdgeSpeech ClientType = "edge-speech"
ClientTypeOpenAISpeech ClientType = "openai-speech"
ClientTypeOpenAITranscription ClientType = "openai-transcription"
ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
ClientTypeOpenRouterTranscription ClientType = "openrouter-transcription"
ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
ClientTypeElevenLabsTranscription ClientType = "elevenlabs-transcription"
ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
ClientTypeDeepgramTranscription ClientType = "deepgram-transcription"
ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
ClientTypeGoogleSpeech ClientType = "google-speech"
ClientTypeGoogleTranscription ClientType = "google-transcription"
ClientTypeOpenAIResponses ClientType = "openai-responses"
ClientTypeOpenAICompletions ClientType = "openai-completions"
ClientTypeAnthropicMessages ClientType = "anthropic-messages"
ClientTypeGoogleGenerativeAI ClientType = "google-generative-ai"
ClientTypeOpenAICodex ClientType = "openai-codex"
ClientTypeGitHubCopilot ClientType = "github-copilot"
ClientTypeEdgeSpeech ClientType = "edge-speech"
ClientTypeOpenAISpeech ClientType = "openai-speech"
ClientTypeOpenRouterSpeech ClientType = "openrouter-speech"
ClientTypeElevenLabsSpeech ClientType = "elevenlabs-speech"
ClientTypeDeepgramSpeech ClientType = "deepgram-speech"
ClientTypeMiniMaxSpeech ClientType = "minimax-speech"
ClientTypeVolcengineSpeech ClientType = "volcengine-speech"
ClientTypeAlibabaSpeech ClientType = "alibabacloud-speech"
ClientTypeMicrosoftSpeech ClientType = "microsoft-speech"
)
const (
@@ -95,7 +88,7 @@ func (m *Model) Validate() error {
if _, err := uuid.Parse(m.ProviderID); err != nil {
return errors.New("provider ID must be a valid UUID")
}
if m.Type != ModelTypeChat && m.Type != ModelTypeEmbedding && m.Type != ModelTypeSpeech && m.Type != ModelTypeTranscription {
if m.Type != ModelTypeChat && m.Type != ModelTypeEmbedding && m.Type != ModelTypeSpeech {
return errors.New("invalid model type")
}
if m.Type == ModelTypeEmbedding {
-15
View File
@@ -175,14 +175,6 @@ func (s *Service) UpsertBot(ctx context.Context, botID string, req UpsertRequest
}
ttsModelUUID = modelID
}
transcriptionModelUUID := pgtype.UUID{}
if value := strings.TrimSpace(req.TranscriptionModelID); value != "" {
modelID, err := db.ParseUUID(value)
if err != nil {
return Settings{}, err
}
transcriptionModelUUID = modelID
}
browserContextUUID := pgtype.UUID{}
if value := strings.TrimSpace(req.BrowserContextID); value != "" {
ctxID, err := db.ParseUUID(value)
@@ -212,7 +204,6 @@ func (s *Service) UpsertBot(ctx context.Context, botID string, req UpsertRequest
SearchProviderID: searchProviderUUID,
MemoryProviderID: memoryProviderUUID,
TtsModelID: ttsModelUUID,
TranscriptionModelID: transcriptionModelUUID,
BrowserContextID: browserContextUUID,
PersistFullToolResults: current.PersistFullToolResults,
})
@@ -307,7 +298,6 @@ func normalizeBotSettingsReadRow(row sqlc.GetSettingsByBotIDRow) Settings {
row.SearchProviderID,
row.MemoryProviderID,
row.TtsModelID,
row.TranscriptionModelID,
row.BrowserContextID,
row.PersistFullToolResults,
)
@@ -332,7 +322,6 @@ func normalizeBotSettingsWriteRow(row sqlc.UpsertBotSettingsRow) Settings {
row.SearchProviderID,
row.MemoryProviderID,
row.TtsModelID,
row.TranscriptionModelID,
row.BrowserContextID,
row.PersistFullToolResults,
)
@@ -356,7 +345,6 @@ func normalizeBotSettingsFields(
searchProviderID pgtype.UUID,
memoryProviderID pgtype.UUID,
ttsModelID pgtype.UUID,
transcriptionModelID pgtype.UUID,
browserContextID pgtype.UUID,
persistFullToolResults bool,
) Settings {
@@ -388,9 +376,6 @@ func normalizeBotSettingsFields(
if ttsModelID.Valid {
settings.TtsModelID = uuid.UUID(ttsModelID.Bytes).String()
}
if transcriptionModelID.Valid {
settings.TranscriptionModelID = uuid.UUID(transcriptionModelID.Bytes).String()
}
if browserContextID.Valid {
settings.BrowserContextID = uuid.UUID(browserContextID.Bytes).String()
}
-2
View File
@@ -12,7 +12,6 @@ type Settings struct {
SearchProviderID string `json:"search_provider_id"`
MemoryProviderID string `json:"memory_provider_id"`
TtsModelID string `json:"tts_model_id"`
TranscriptionModelID string `json:"transcription_model_id"`
BrowserContextID string `json:"browser_context_id"`
Language string `json:"language"`
AclDefaultEffect string `json:"acl_default_effect"`
@@ -37,7 +36,6 @@ type UpsertRequest struct {
SearchProviderID string `json:"search_provider_id,omitempty"`
MemoryProviderID string `json:"memory_provider_id,omitempty"`
TtsModelID string `json:"tts_model_id,omitempty"`
TranscriptionModelID string `json:"transcription_model_id,omitempty"`
BrowserContextID string `json:"browser_context_id,omitempty"`
Language string `json:"language,omitempty"`
AclDefaultEffect string `json:"acl_default_effect,omitempty"`
@@ -1,4 +1,4 @@
package audio
package tts
import "context"
@@ -6,10 +6,10 @@ import (
"log/slog"
"strings"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
const TtsTypeEdge audio.TtsType = "edge"
const TtsTypeEdge tts.TtsType = "edge"
const edgeModelReadAloud = "edge-read-aloud"
@@ -33,12 +33,12 @@ func NewEdgeAdapterWithClient(log *slog.Logger, client *EdgeWsClient) *EdgeAdapt
}
}
func (*EdgeAdapter) Type() audio.TtsType {
func (*EdgeAdapter) Type() tts.TtsType {
return TtsTypeEdge
}
func (*EdgeAdapter) Meta() audio.TtsMeta {
return audio.TtsMeta{
func (*EdgeAdapter) Meta() tts.TtsMeta {
return tts.TtsMeta{
Provider: "Microsoft Edge",
Description: "Microsoft Edge TTS",
}
@@ -54,32 +54,32 @@ var edgeFormats = []string{
"webm-24khz-16bit-mono-opus",
}
var edgeSpeedConstraint = &audio.ParamConstraint{
var edgeSpeedConstraint = &tts.ParamConstraint{
Options: []float64{0.5, 1.0, 2.0, 3.0},
Default: 1.0,
}
var edgePitchConstraint = &audio.ParamConstraint{
var edgePitchConstraint = &tts.ParamConstraint{
Min: -100,
Max: 100,
Default: 0,
}
func (*EdgeAdapter) Models() []audio.ModelInfo {
var voices []audio.VoiceInfo
func (*EdgeAdapter) Models() []tts.ModelInfo {
var voices []tts.VoiceInfo
for lang, ids := range EdgeTTSVoices {
for _, id := range ids {
name := strings.TrimPrefix(id, lang+"-")
name = strings.TrimSuffix(name, "Neural")
voices = append(voices, audio.VoiceInfo{ID: id, Lang: lang, Name: name})
voices = append(voices, tts.VoiceInfo{ID: id, Lang: lang, Name: name})
}
}
return []audio.ModelInfo{
return []tts.ModelInfo{
{
ID: edgeModelReadAloud,
Name: "Edge Read Aloud",
Description: "Built-in Edge Read Aloud speech model",
Capabilities: audio.ModelCapabilities{
Capabilities: tts.ModelCapabilities{
Voices: voices,
Formats: edgeFormats,
Speed: edgeSpeedConstraint,
@@ -100,14 +100,14 @@ func (*EdgeAdapter) ResolveModel(model string) (string, error) {
return edgeModelReadAloud, nil
}
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config audio.AudioConfig) ([]byte, error) {
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config tts.AudioConfig) ([]byte, error) {
if err := config.Validate(); err != nil {
return nil, fmt.Errorf("edge tts: invalid config: %w", err)
}
return a.client.Synthesize(ctx, text, config)
}
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config audio.AudioConfig) (chan []byte, chan error) {
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config tts.AudioConfig) (chan []byte, chan error) {
if err := config.Validate(); err != nil {
errCh := make(chan error, 1)
errCh <- fmt.Errorf("edge tts: invalid config: %w", err)
@@ -8,7 +8,7 @@ import (
"strings"
"testing"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
func TestEdgeAdapter_TypeAndMeta(t *testing.T) {
@@ -37,7 +37,7 @@ func TestEdgeAdapter_Synthesize_WithMockServer(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
audio, err := adapter.Synthesize(ctx, "Hello", edgeModelReadAloud, config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -61,7 +61,7 @@ func TestEdgeAdapter_Stream_WithMockServer(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
ch, errCh := adapter.Stream(ctx, "Hi", edgeModelReadAloud, config)
var chunks [][]byte
for b := range ch {
@@ -86,7 +86,7 @@ func TestEdgeAdapter_Synthesize_NotConnected(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, audio.AudioConfig{})
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, tts.AudioConfig{})
if err == nil {
t.Fatal("expected error when connection fails")
}
@@ -20,7 +20,7 @@ import (
"github.com/google/uuid"
"github.com/gorilla/websocket"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
// Edge TTS WebSocket client.
@@ -184,7 +184,7 @@ func (c *EdgeWsClient) sendFrame(path, contentType, body string, extraHeaders ma
}
// Configure sends the speech.config message (output format, etc.).
func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig) error {
func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) error {
c.mu.Lock()
defer c.mu.Unlock()
if c.conn == nil {
@@ -207,7 +207,7 @@ func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig)
}
// buildSSML builds SSML with rate and pitch for Edge TTS prosody.
func buildSSML(text string, voice audio.VoiceConfig, speed, pitch float64) string {
func buildSSML(text string, voice tts.VoiceConfig, speed, pitch float64) string {
voiceID := voice.ID
if voiceID == "" {
voiceID = DEFAULT_VOICE
@@ -241,7 +241,7 @@ func escapeSSML(s string) string {
// Synthesize sends SSML and synchronously collects all audio data.
// It handles the full lifecycle: connect → configure → send → receive → close.
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config audio.AudioConfig) ([]byte, error) {
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config tts.AudioConfig) ([]byte, error) {
if err := c.Connect(ctx); err != nil {
return nil, err
}
@@ -338,7 +338,7 @@ func parseAudioChunk(data []byte) ([]byte, error) {
// Stream sends SSML and returns audio chunks via channel.
// It handles the full lifecycle: connect → configure → send → stream → close.
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config audio.AudioConfig) (ch chan []byte, errCh chan error) {
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config tts.AudioConfig) (ch chan []byte, errCh chan error) {
ch = make(chan []byte, 8)
errCh = make(chan error, 1)
go func() {
@@ -9,7 +9,7 @@ import (
"testing"
"time"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
// Real Edge TTS integration tests. Not compiled by default (requires -tags=integration).
@@ -17,14 +17,14 @@ import (
//
// Run:
//
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS -v
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS -v
func TestRealEdgeTTS_Synthesize(t *testing.T) {
client := NewEdgeWsClient()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
audio, err := client.Synthesize(ctx, "Hello, this is a real Edge TTS test.", config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -40,7 +40,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
ch, errCh := client.Stream(ctx, "你好,这是流式测试。", config)
var total int
for b := range ch {
@@ -57,7 +57,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
// TestRealEdgeTTS_Formats tries every candidate format and reports which ones are supported.
//
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_Formats -v
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_Formats -v
func TestRealEdgeTTS_Formats(t *testing.T) {
formats := []string{
"audio-24khz-48kbitrate-mono-mp3",
@@ -71,8 +71,8 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := audio.AudioConfig{
Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
config := tts.AudioConfig{
Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
Format: fmt,
Speed: 1.0,
}
@@ -88,7 +88,7 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
// TestRealEdgeTTS_SaveAudio synthesizes speech and writes the result to a file for manual inspection.
//
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
func TestRealEdgeTTS_SaveAudio(t *testing.T) {
client := NewEdgeWsClient()
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -97,11 +97,11 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
cases := []struct {
name string
text string
voice audio.VoiceConfig
voice tts.VoiceConfig
file string
}{
{"en", "Hello, this is an Edge TTS audio save test.", audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
{"zh", "你好,这是一段中文语音合成测试。", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
{"en", "Hello, this is an Edge TTS audio save test.", tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
{"zh", "你好,这是一段中文语音合成测试。", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
}
outDir := filepath.Join(os.TempDir(), "edge_tts_test")
@@ -111,7 +111,7 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
config := audio.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
config := tts.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
audio, err := client.Synthesize(ctx, tc.text, config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -11,7 +11,7 @@ import (
"github.com/gorilla/websocket"
"github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/tts"
)
var upgrader = websocket.Upgrader{
@@ -95,7 +95,7 @@ func TestEdgeWsClient_ConnectAndSynthesize(t *testing.T) {
client := NewEdgeWsClient()
client.BaseURL = wsURL
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
audio, err := client.Synthesize(t.Context(), "Hello world", config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -114,7 +114,7 @@ func TestEdgeWsClient_Stream(t *testing.T) {
client := NewEdgeWsClient()
client.BaseURL = wsURL
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
ch, errCh := client.Stream(t.Context(), "Hi", config)
var chunks [][]byte
for b := range ch {
@@ -197,7 +197,7 @@ func TestParseAudioChunk_EmptyOrShort(t *testing.T) {
func TestBuildSSML(t *testing.T) {
t.Parallel()
ssml := buildSSML("Hello", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
ssml := buildSSML("Hello", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
if !strings.Contains(ssml, "zh-CN-XiaoxiaoNeural") {
t.Errorf("ssml should contain voice: %s", ssml)
}
+68
View File
@@ -0,0 +1,68 @@
package tts
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"github.com/jackc/pgx/v5/pgtype"
"github.com/memohai/memoh/internal/db/sqlc"
)
// SyncRegistry mirrors the in-memory speech provider registry into the
// database: every registered provider definition is upserted as a provider
// row, and each of its models is either upserted (type "speech") or deleted
// when hidden. Logging is optional; pass a nil logger to stay silent.
// Returns the first error encountered; earlier upserts are not rolled back.
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
	for _, def := range registry.List() {
		// Provider config is stored as an empty JSON object; per-provider
		// settings are supplied later by users, not by the registry.
		configJSON, err := json.Marshal(map[string]any{})
		if err != nil {
			return fmt.Errorf("marshal speech provider config: %w", err)
		}
		// Only set the icon column when the definition actually has one,
		// so providers without icons keep a SQL NULL rather than "".
		var icon pgtype.Text
		if def.Icon != "" {
			icon = pgtype.Text{String: def.Icon, Valid: true}
		}
		provider, err := queries.UpsertRegistryProvider(ctx, sqlc.UpsertRegistryProviderParams{
			Name:       def.DisplayName,
			ClientType: string(def.ClientType),
			Icon:       icon,
			Config:     configJSON,
		})
		if err != nil {
			return fmt.Errorf("upsert speech provider %s: %w", def.ClientType, err)
		}
		synced := 0
		for _, model := range def.Models {
			// Hidden template models are actively removed so a previously
			// synced row does not linger after the template is hidden.
			// NOTE(review): shouldHideTemplateModel is defined elsewhere in
			// this package — its exact criteria are not visible here.
			if shouldHideTemplateModel(def, model.ID) {
				if err := queries.DeleteModelByProviderIDAndModelID(ctx, sqlc.DeleteModelByProviderIDAndModelIDParams{
					ProviderID: provider.ID,
					ModelID:    model.ID,
				}); err != nil {
					return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
				}
				continue
			}
			// Like the provider, each model starts with an empty JSON config.
			modelConfigJSON, err := json.Marshal(map[string]any{})
			if err != nil {
				return fmt.Errorf("marshal speech model config: %w", err)
			}
			// Empty names map to SQL NULL (Valid=false) instead of "".
			name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
			if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
				ModelID:    model.ID,
				Name:       name,
				ProviderID: provider.ID,
				Type:       "speech",
				Config:     modelConfigJSON,
			}); err != nil {
				return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
			}
			synced++
		}
		// synced counts only the models that were upserted, not the hidden
		// ones that were deleted.
		if logger != nil {
			logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
		}
	}
	return nil
}
@@ -1,4 +1,4 @@
package audio
package tts
// VoiceConfig is kept for backward compatibility with the legacy Edge adapter tests.
type VoiceConfig struct {
@@ -1,4 +1,4 @@
package audio
package tts
import (
"fmt"
@@ -8,43 +8,31 @@ import (
alibabaspeech "github.com/memohai/twilight-ai/provider/alibabacloud/speech"
deepgramspeech "github.com/memohai/twilight-ai/provider/deepgram/speech"
deepgramtranscription "github.com/memohai/twilight-ai/provider/deepgram/transcription"
edgespeech "github.com/memohai/twilight-ai/provider/edge/speech"
elevenlabsspeech "github.com/memohai/twilight-ai/provider/elevenlabs/speech"
elevenlabstranscription "github.com/memohai/twilight-ai/provider/elevenlabs/transcription"
googletranscription "github.com/memohai/twilight-ai/provider/google/transcription"
microsoftspeech "github.com/memohai/twilight-ai/provider/microsoft/speech"
minimaxspeech "github.com/memohai/twilight-ai/provider/minimax/speech"
openaispeech "github.com/memohai/twilight-ai/provider/openai/speech"
openaitranscription "github.com/memohai/twilight-ai/provider/openai/transcription"
openrouterspeech "github.com/memohai/twilight-ai/provider/openrouter/speech"
openroutertranscription "github.com/memohai/twilight-ai/provider/openrouter/transcription"
volcenginespeech "github.com/memohai/twilight-ai/provider/volcengine/speech"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/models"
)
type (
ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
TranscriptionProviderFactory func(config map[string]any) (sdk.TranscriptionProvider, error)
)
type ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
type ProviderDefinition struct {
ClientType models.ClientType
DisplayName string
Icon string
Description string
ConfigSchema ConfigSchema
DefaultModel string
SupportsList bool
Models []ModelInfo
Factory ProviderFactory
DefaultTranscriptionModel string
SupportsTranscriptionList bool
TranscriptionModels []ModelInfo
TranscriptionFactory TranscriptionProviderFactory
Order int
ClientType models.ClientType
DisplayName string
Icon string
Description string
ConfigSchema ConfigSchema
DefaultModel string
SupportsList bool
Models []ModelInfo
Factory ProviderFactory
Order int
}
type Registry struct {
@@ -53,60 +41,11 @@ type Registry struct {
ordered []models.ClientType
}
func isTranscriptionClientType(clientType models.ClientType) bool {
switch clientType {
case
models.ClientTypeOpenAITranscription,
models.ClientTypeOpenRouterTranscription,
models.ClientTypeElevenLabsTranscription,
models.ClientTypeDeepgramTranscription,
models.ClientTypeGoogleTranscription:
return true
default:
return false
}
}
func speechToTranscriptionClientType(clientType models.ClientType) models.ClientType {
switch clientType {
case models.ClientTypeOpenAISpeech:
return models.ClientTypeOpenAITranscription
case models.ClientTypeOpenRouterSpeech:
return models.ClientTypeOpenRouterTranscription
case models.ClientTypeElevenLabsSpeech:
return models.ClientTypeElevenLabsTranscription
case models.ClientTypeDeepgramSpeech:
return models.ClientTypeDeepgramTranscription
case models.ClientTypeGoogleSpeech:
return models.ClientTypeGoogleTranscription
default:
return ""
}
}
func transcriptionDisplayName(displayName string) string {
displayName = strings.TrimSpace(displayName)
if displayName == "Google Speech" {
return "Google Transcription"
}
if strings.HasSuffix(displayName, " Speech") {
return strings.TrimSuffix(displayName, " Speech") + " Transcription"
}
return displayName + " Transcription"
}
func NewRegistry() *Registry {
r := &Registry{
providers: make(map[models.ClientType]ProviderDefinition),
}
baseDefs := defaultProviderDefinitions()
for _, def := range baseDefs {
if def.Factory == nil && def.TranscriptionFactory != nil {
continue
}
r.Register(def)
}
for _, def := range transcriptionProviderDefinitions(baseDefs) {
for _, def := range defaultProviderDefinitions() {
r.Register(def)
}
return r
@@ -155,98 +94,17 @@ func (r *Registry) ListMeta() []ProviderMetaResponse {
metas := make([]ProviderMetaResponse, 0, len(defs))
for _, def := range defs {
metas = append(metas, ProviderMetaResponse{
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultModel,
Models: def.Models,
DefaultSynthesisModel: def.DefaultModel,
SynthesisModels: def.Models,
SupportsSynthesisList: def.SupportsList,
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
TranscriptionModels: def.TranscriptionModels,
SupportsTranscriptionList: def.SupportsTranscriptionList,
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultModel,
Models: def.Models,
})
}
return metas
}
func (r *Registry) ListSpeechMeta() []ProviderMetaResponse {
defs := r.List()
metas := make([]ProviderMetaResponse, 0, len(defs))
for _, def := range defs {
if def.Factory == nil {
continue
}
metas = append(metas, ProviderMetaResponse{
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultModel,
Models: def.Models,
DefaultSynthesisModel: def.DefaultModel,
SynthesisModels: def.Models,
SupportsSynthesisList: def.SupportsList,
})
}
return metas
}
func (r *Registry) ListTranscriptionMeta() []ProviderMetaResponse {
defs := r.List()
metas := make([]ProviderMetaResponse, 0, len(defs))
for _, def := range defs {
if def.TranscriptionFactory == nil || !isTranscriptionClientType(def.ClientType) {
continue
}
modelsList := def.TranscriptionModels
if len(modelsList) == 0 {
modelsList = def.Models
}
metas = append(metas, ProviderMetaResponse{
Provider: string(def.ClientType),
DisplayName: def.DisplayName,
Description: def.Description,
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultTranscriptionModel,
Models: modelsList,
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
TranscriptionModels: modelsList,
SupportsTranscriptionList: def.SupportsTranscriptionList,
})
}
return metas
}
func transcriptionProviderDefinitions(base []ProviderDefinition) []ProviderDefinition {
out := make([]ProviderDefinition, 0, len(base))
for _, def := range base {
clientType := speechToTranscriptionClientType(def.ClientType)
if clientType == "" || def.TranscriptionFactory == nil {
continue
}
modelsList := def.TranscriptionModels
out = append(out, ProviderDefinition{
ClientType: clientType,
DisplayName: transcriptionDisplayName(def.DisplayName),
Icon: def.Icon,
Description: strings.TrimSpace(def.Description),
ConfigSchema: def.ConfigSchema,
DefaultModel: def.DefaultTranscriptionModel,
SupportsList: def.SupportsTranscriptionList,
Models: modelsList,
DefaultTranscriptionModel: def.DefaultTranscriptionModel,
SupportsTranscriptionList: def.SupportsTranscriptionList,
TranscriptionModels: modelsList,
TranscriptionFactory: def.TranscriptionFactory,
Order: def.Order + 1,
})
}
return out
}
func defaultProviderDefinitions() []ProviderDefinition {
edgeVoices := make([]VoiceInfo, 0)
for lang, ids := range edgespeech.EdgeTTSVoices {
@@ -315,10 +173,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
secretField("api_key", "API Key", "Bearer API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.openai.com/v1", 20),
}},
DefaultModel: "gpt-4o-mini-tts",
SupportsList: true,
DefaultTranscriptionModel: "gpt-4o-mini-transcribe",
SupportsTranscriptionList: true,
DefaultModel: "gpt-4o-mini-tts",
SupportsList: true,
Models: []ModelInfo{{
ID: "gpt-4o-mini-tts",
Name: "gpt-4o-mini-tts",
@@ -339,23 +195,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
Formats: []string{"mp3", "opus", "pcm", "wav"},
},
}},
TranscriptionModels: []ModelInfo{{
ID: "gpt-4o-mini-transcribe",
Name: "gpt-4o-mini-transcribe",
Description: "Default OpenAI transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language", "Language", "Optional ISO language hint", false, "", 10),
stringField("prompt", "Prompt", "Optional prompt to guide transcription", false, "", 20),
numberField("temperature", "Temperature", "Sampling temperature", false, 0, 30),
enumField("response_format", "Response Format", "Transcription response format", false, []string{"json", "verbose_json", "text", "srt", "vtt"}, 40),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language", "Language", "Optional ISO language hint", false, "", 10),
stringField("prompt", "Prompt", "Optional prompt to guide transcription", false, "", 20),
numberField("temperature", "Temperature", "Sampling temperature", false, 0, 30),
enumField("response_format", "Response Format", "Transcription response format", false, []string{"json", "verbose_json", "text", "srt", "vtt"}, 40),
}}},
}},
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
opts := []openaispeech.Option{}
if v := configString(config, "api_key"); v != "" {
@@ -366,16 +205,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
}
return openaispeech.New(opts...), nil
},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []openaitranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, openaitranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, openaitranscription.WithBaseURL(v))
}
return openaitranscription.New(opts...), nil
},
Order: 20,
},
{
@@ -387,10 +216,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
secretField("api_key", "API Key", "OpenRouter API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://openrouter.ai/api/v1", 20),
}},
DefaultModel: "openrouter-tts",
SupportsList: true,
DefaultTranscriptionModel: "openai/gpt-4o-mini-transcribe",
SupportsTranscriptionList: true,
DefaultModel: "openrouter-tts",
SupportsList: true,
Models: []ModelInfo{{
ID: "openrouter-tts",
Name: "openrouter-tts",
@@ -407,17 +234,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
}}},
}},
TranscriptionModels: []ModelInfo{{
ID: "openai/gpt-4o-mini-transcribe",
Name: "openai/gpt-4o-mini-transcribe",
Description: "Default OpenRouter transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
advancedStringField("prompt", "Prompt", "Prompt passed to the model before audio input", false, "", 10),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
advancedStringField("prompt", "Prompt", "Prompt passed to the model before audio input", false, "", 10),
}}},
}},
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
opts := []openrouterspeech.Option{}
if v := configString(config, "api_key"); v != "" {
@@ -428,16 +244,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
}
return openrouterspeech.New(opts...), nil
},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []openroutertranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, openroutertranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, openroutertranscription.WithBaseURL(v))
}
return openroutertranscription.New(opts...), nil
},
Order: 30,
},
{
@@ -449,10 +255,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
secretField("api_key", "API Key", "ElevenLabs API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.elevenlabs.io", 20),
}},
DefaultModel: "elevenlabs-tts",
SupportsList: true,
DefaultTranscriptionModel: "scribe_v2",
SupportsTranscriptionList: true,
DefaultModel: "elevenlabs-tts",
SupportsList: true,
Models: []ModelInfo{{
ID: "elevenlabs-tts",
Name: "elevenlabs-tts",
@@ -485,25 +289,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "en-US", 110),
}}},
}},
TranscriptionModels: []ModelInfo{{
ID: "scribe_v2",
Name: "scribe_v2",
Description: "Default ElevenLabs transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "", 10),
boolField("tag_audio_events", "Tag Audio Events", "Include non-speech events in timestamps", false, 20),
boolField("diarize", "Diarize", "Enable speaker diarization", false, 30),
numberField("num_speakers", "Number of Speakers", "Optional expected speaker count", false, 0, 40),
enumField("timestamps_granularity", "Timestamps Granularity", "Timestamps granularity", false, []string{"word", "character"}, 50),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "", 10),
boolField("tag_audio_events", "Tag Audio Events", "Include non-speech events in timestamps", false, 20),
boolField("diarize", "Diarize", "Enable speaker diarization", false, 30),
numberField("num_speakers", "Number of Speakers", "Optional expected speaker count", false, 0, 40),
enumField("timestamps_granularity", "Timestamps Granularity", "Timestamps granularity", false, []string{"word", "character"}, 50),
}}},
}},
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
opts := []elevenlabsspeech.Option{}
if v := configString(config, "api_key"); v != "" {
@@ -514,52 +299,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
}
return elevenlabsspeech.New(opts...), nil
},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []elevenlabstranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, elevenlabstranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, elevenlabstranscription.WithBaseURL(v))
}
return elevenlabstranscription.New(opts...), nil
},
Order: 40,
},
{
ClientType: models.ClientTypeGoogleSpeech,
DisplayName: "Google Speech",
Icon: "google-color",
Description: "Google Gemini speech transcription",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
secretField("api_key", "API Key", "Google API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://generativelanguage.googleapis.com/v1beta", 20),
}},
DefaultTranscriptionModel: "gemini-2.5-flash",
SupportsTranscriptionList: true,
TranscriptionModels: []ModelInfo{{
ID: "gemini-2.5-flash",
Name: "gemini-2.5-flash",
Description: "Default Google transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
advancedStringField("prompt", "Prompt", "Prompt passed alongside audio", false, "", 10),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
advancedStringField("prompt", "Prompt", "Prompt passed alongside audio", false, "", 10),
}}},
}},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []googletranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, googletranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, googletranscription.WithBaseURL(v))
}
return googletranscription.New(opts...), nil
},
Order: 45,
},
{
ClientType: models.ClientTypeDeepgramSpeech,
DisplayName: "Deepgram Speech",
@@ -569,10 +310,8 @@ func defaultProviderDefinitions() []ProviderDefinition {
secretField("api_key", "API Key", "Deepgram API key", true, 10),
stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.deepgram.com", 20),
}},
DefaultModel: "deepgram-tts",
SupportsList: false,
DefaultTranscriptionModel: "nova-3",
SupportsTranscriptionList: false,
DefaultModel: "deepgram-tts",
SupportsList: false,
Models: []ModelInfo{{
ID: "deepgram-tts",
Name: "deepgram-tts",
@@ -593,25 +332,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
Formats: []string{"wav", "none"},
},
}},
TranscriptionModels: []ModelInfo{{
ID: "nova-3",
Name: "nova-3",
Description: "Default Deepgram transcription model",
ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language", "Language", "Optional language hint", false, "", 10),
boolField("smart_format", "Smart Format", "Enable smart formatting", false, 20),
boolField("detect_language", "Detect Language", "Enable automatic language detection", false, 30),
boolField("diarize", "Diarize", "Enable speaker diarization", false, 40),
boolField("punctuate", "Punctuate", "Enable punctuation", false, 50),
}},
Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
stringField("language", "Language", "Optional language hint", false, "", 10),
boolField("smart_format", "Smart Format", "Enable smart formatting", false, 20),
boolField("detect_language", "Detect Language", "Enable automatic language detection", false, 30),
boolField("diarize", "Diarize", "Enable speaker diarization", false, 40),
boolField("punctuate", "Punctuate", "Enable punctuation", false, 50),
}}},
}},
Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
opts := []deepgramspeech.Option{}
if v := configString(config, "api_key"); v != "" {
@@ -622,16 +342,6 @@ func defaultProviderDefinitions() []ProviderDefinition {
}
return deepgramspeech.New(opts...), nil
},
TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
opts := []deepgramtranscription.Option{}
if v := configString(config, "api_key"); v != "" {
opts = append(opts, deepgramtranscription.WithAPIKey(v))
}
if v := configString(config, "base_url"); v != "" {
opts = append(opts, deepgramtranscription.WithBaseURL(v))
}
return deepgramtranscription.New(opts...), nil
},
Order: 50,
},
{
+435
View File
@@ -0,0 +1,435 @@
package tts
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/db"
"github.com/memohai/memoh/internal/db/sqlc"
"github.com/memohai/memoh/internal/models"
)
// Service implements text-to-speech operations on top of the unified
// providers/models tables and the in-process provider registry.
type Service struct {
	queries  *sqlc.Queries // generated DB accessors
	logger   *slog.Logger  // tagged with service=tts in NewService
	registry *Registry     // built-in speech provider definitions
}
// NewService wires the TTS service with its persistence layer and provider
// registry. The supplied logger is tagged with service=tts for filtering.
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
	svc := &Service{
		queries:  queries,
		registry: registry,
		logger:   log.With(slog.String("service", "tts")),
	}
	return svc
}
// Registry exposes the provider registry backing this service.
func (s *Service) Registry() *Registry { return s.registry }
// ListMeta returns static adapter metadata from the registry. The context
// is unused because no I/O is involved.
func (s *Service) ListMeta(_ context.Context) []ProviderMetaResponse {
	return s.registry.ListMeta()
}
// ListSpeechProviders returns every speech-capable provider stored in the
// unified providers table, with secret config values masked.
func (s *Service) ListSpeechProviders(ctx context.Context) ([]SpeechProviderResponse, error) {
	providers, err := s.queries.ListSpeechProviders(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech providers: %w", err)
	}
	out := make([]SpeechProviderResponse, 0, len(providers))
	for _, p := range providers {
		out = append(out, toSpeechProviderResponse(p))
	}
	return out, nil
}
// GetSpeechProvider loads a single provider row by its UUID string and
// converts it to the API response shape (secrets masked).
func (s *Service) GetSpeechProvider(ctx context.Context, id string) (SpeechProviderResponse, error) {
	var empty SpeechProviderResponse
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return empty, err
	}
	row, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return empty, fmt.Errorf("get speech provider: %w", err)
	}
	return toSpeechProviderResponse(row), nil
}
// ListSpeechModels returns all stored speech models across providers,
// skipping registry entries flagged as template-only.
func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse, error) {
	modelRows, err := s.queries.ListSpeechModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech models: %w", err)
	}
	out := make([]SpeechModelResponse, 0, len(modelRows))
	for _, m := range modelRows {
		// Template-only models exist to seed discovery, not for listing.
		if s.shouldHideModel(m.ProviderType, m.ModelID) {
			continue
		}
		out = append(out, toSpeechModelFromListRow(m))
	}
	return out, nil
}
// ListSpeechModelsByProvider returns the speech models stored for one
// provider, skipping entries the registry marks as template-only.
func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	// The provider row is needed to look up the registry definition that
	// decides which models are template-only.
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	rows, err := s.queries.ListSpeechModelsByProviderID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("list speech models by provider: %w", err)
	}
	items := make([]SpeechModelResponse, 0, len(rows))
	for _, row := range rows {
		if shouldHideTemplateModel(def, row.ModelID) {
			continue
		}
		// Provider type is left empty; callers of this endpoint already
		// know which provider they queried.
		items = append(items, toSpeechModelFromModel(row, ""))
	}
	return items, nil
}
// GetSpeechModel loads one stored speech model (joined with its provider
// type) by its UUID string.
func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
	var empty SpeechModelResponse
	pgID, err := db.ParseUUID(id)
	if err != nil {
		return empty, err
	}
	row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return empty, fmt.Errorf("get speech model: %w", err)
	}
	return toSpeechModelWithProviderResponse(row), nil
}
// Synthesize generates a complete audio clip for text using the stored
// speech model identified by modelID (a DB UUID). overrideCfg is merged on
// top of the provider and model configs (later values win). Returns the
// raw audio bytes and their content type.
func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
	params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
	if err != nil {
		return nil, "", err
	}
	result, err := sdk.GenerateSpeech(ctx,
		sdk.WithSpeechModel(params.model),
		sdk.WithText(text),
		sdk.WithSpeechConfig(params.config),
	)
	if err != nil {
		return nil, "", fmt.Errorf("synthesize: %w", err)
	}
	return result.Audio, result.ContentType, nil
}
// StreamToFile synthesizes text with the given stored speech model and
// writes the audio to w, returning the audio content type.
//
// NOTE(review): despite the name, the SDK stream is fully collected via
// Bytes() and written to w in a single call — confirm whether incremental
// chunked writes are required before relying on this for large outputs.
func (s *Service) StreamToFile(ctx context.Context, modelID string, text string, w io.Writer) (string, error) {
	params, err := s.resolveSpeechParams(ctx, modelID, text, nil)
	if err != nil {
		return "", err
	}
	streamResult, err := sdk.StreamSpeech(ctx,
		sdk.WithSpeechModel(params.model),
		sdk.WithText(text),
		sdk.WithSpeechConfig(params.config),
	)
	if err != nil {
		return "", fmt.Errorf("stream: %w", err)
	}
	// Drain the whole stream into memory before writing.
	audio, err := streamResult.Bytes()
	if err != nil {
		return "", fmt.Errorf("stream: %w", err)
	}
	if _, writeErr := w.Write(audio); writeErr != nil {
		return "", fmt.Errorf("write chunk: %w", writeErr)
	}
	return streamResult.ContentType, nil
}
// GetModelCapabilities returns the registry-declared capabilities for the
// stored model identified by modelID. When the matched template declares
// no capability config schema, the template's own config schema is used as
// a fallback.
func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
	if err != nil {
		return nil, err
	}
	template := findModelTemplate(def, modelRow.ModelID)
	if template == nil {
		return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
	}
	// Work on a copy so the fallback assignment below never mutates the
	// registry's template.
	caps := template.Capabilities
	if len(caps.ConfigSchema.Fields) == 0 {
		caps.ConfigSchema = template.ConfigSchema
	}
	return &caps, nil
}
// FetchRemoteModels asks the provider's remote API for its available
// speech models. Each discovered ID is merged with the registry's built-in
// template (when one exists) so curated config schemas are preserved.
// Fails when the provider definition does not support model discovery.
func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
	pgID, err := db.ParseUUID(providerID)
	if err != nil {
		return nil, err
	}
	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	if !def.SupportsList || def.Factory == nil {
		return nil, fmt.Errorf("speech provider does not support model discovery: %s", providerRow.ClientType)
	}
	// Build a live client from the provider's stored config.
	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}
	remoteModels, err := provider.ListModels(ctx)
	if err != nil {
		return nil, fmt.Errorf("list speech models: %w", err)
	}
	discovered := make([]ModelInfo, 0, len(remoteModels))
	for _, remoteModel := range remoteModels {
		// Skip nil or anonymous entries defensively.
		if remoteModel == nil || remoteModel.ID == "" {
			continue
		}
		discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.Models))
	}
	return discovered, nil
}
// resolvedSpeechParams bundles a ready-to-use SDK speech model with the
// fully merged config for one synthesis request.
type resolvedSpeechParams struct {
	model  *sdk.SpeechModel // provider-bound model handle
	config map[string]any   // provider + model + per-request override, later wins
}
// resolveSpeechParams loads the model and provider rows for modelID,
// builds the provider client from its stored config, and merges configs in
// ascending precedence: provider, then model, then per-request override.
func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
	// text is currently unused here; the parameter is kept so the
	// signature mirrors the Synthesize/StreamToFile call sites.
	_ = text
	pgID, err := db.ParseUUID(modelID)
	if err != nil {
		return nil, err
	}
	modelRow, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
	if err != nil {
		return nil, fmt.Errorf("get speech model: %w", err)
	}
	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
	if err != nil {
		return nil, fmt.Errorf("get speech provider: %w", err)
	}
	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
	if err != nil {
		return nil, err
	}
	provider, err := def.Factory(parseConfig(providerRow.Config))
	if err != nil {
		return nil, fmt.Errorf("build speech provider: %w", err)
	}
	// Later maps win on key collisions.
	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
	return &resolvedSpeechParams{
		model:  &sdk.SpeechModel{ID: modelRow.ModelID, Provider: provider},
		config: cfg,
	}, nil
}
// parseConfig decodes a raw JSON config blob into a map. Empty, malformed,
// or JSON-null input all yield an empty (but non-nil) map.
func parseConfig(raw []byte) map[string]any {
	empty := map[string]any{}
	if len(raw) == 0 {
		return empty
	}
	var decoded map[string]any
	if err := json.Unmarshal(raw, &decoded); err == nil && decoded != nil {
		return decoded
	}
	return empty
}
// mergeConfig flattens the given maps into a single new map; later parts
// override earlier ones on key collisions. Nil parts are skipped safely.
func mergeConfig(parts ...map[string]any) map[string]any {
	merged := map[string]any{}
	for _, part := range parts {
		for k, v := range part {
			merged[k] = v
		}
	}
	return merged
}
// mergeRemoteModelInfo returns the built-in template for modelID when one
// exists, so remotely discovered models inherit curated metadata and
// schemas; otherwise it returns a minimal entry whose ID doubles as the
// display name.
func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
	for i := range defaults {
		if defaults[i].ID == modelID {
			return defaults[i]
		}
	}
	return ModelInfo{ID: modelID, Name: modelID}
}
// shouldHideModel reports whether a model should be hidden from listings,
// delegating to the registry definition for clientType. Unknown client
// types are never hidden.
func (s *Service) shouldHideModel(clientType string, modelID string) bool {
	def, err := s.registry.Get(models.ClientType(clientType))
	if err != nil {
		return false
	}
	return shouldHideTemplateModel(def, modelID)
}
// shouldHideTemplateModel reports whether modelID is a template-only entry
// for a provider that supports remote model listing. Providers without
// listing support always expose their built-in models.
func shouldHideTemplateModel(def ProviderDefinition, modelID string) bool {
	if !def.SupportsList {
		return false
	}
	for i := range def.Models {
		if def.Models[i].ID == modelID {
			return def.Models[i].TemplateOnly
		}
	}
	return false
}
// findModelTemplate resolves the built-in ModelInfo used for modelID: an
// exact ID match first, then the provider's declared default model, then
// the first template as a last resort. Returns nil only when the provider
// declares no models at all.
func findModelTemplate(def ProviderDefinition, modelID string) *ModelInfo {
	lookup := func(id string) *ModelInfo {
		for i := range def.Models {
			if def.Models[i].ID == id {
				return &def.Models[i]
			}
		}
		return nil
	}
	if m := lookup(modelID); m != nil {
		return m
	}
	if def.DefaultModel != "" {
		if m := lookup(def.DefaultModel); m != nil {
			return m
		}
	}
	if len(def.Models) == 0 {
		return nil
	}
	return &def.Models[0]
}
// toSpeechProviderResponse converts a provider DB row into the API shape,
// masking secret config values before they leave the service.
func toSpeechProviderResponse(row sqlc.Provider) SpeechProviderResponse {
	resp := SpeechProviderResponse{
		ID:         row.ID.String(),
		Name:       row.Name,
		ClientType: row.ClientType,
		Enable:     row.Enable,
		Config:     maskSpeechProviderConfig(parseConfig(row.Config)),
		CreatedAt:  row.CreatedAt.Time,
		UpdatedAt:  row.UpdatedAt.Time,
	}
	// Icon is nullable in the DB; leave it empty when unset.
	if row.Icon.Valid {
		resp.Icon = row.Icon.String
	}
	return resp
}
// maskSpeechProviderConfig returns a copy of cfg in which every non-empty
// string stored under a secret key is replaced by its masked form.
// Non-secret keys and non-string values pass through untouched.
func maskSpeechProviderConfig(cfg map[string]any) map[string]any {
	out := make(map[string]any, len(cfg))
	for key, value := range cfg {
		str, isString := value.(string)
		if isString && str != "" && isSpeechSecretKey(key) {
			out[key] = maskSpeechSecret(str)
		} else {
			out[key] = value
		}
	}
	return out
}
// isSpeechSecretKey reports whether a provider config key holds a
// credential that must be masked in API responses.
func isSpeechSecretKey(key string) bool {
	return key == "api_key" ||
		key == "access_key" ||
		key == "secret_key" ||
		key == "app_key"
}
func maskSpeechSecret(value string) string {
if len(value) <= 8 {
return "********"
}
return value[:4] + "****" + value[len(value)-4:]
}
// toSpeechModelFromListRow converts a joined list row (model plus provider
// type) into the API response shape. Malformed config JSON is ignored and
// yields a nil Config.
func toSpeechModelFromListRow(row sqlc.ListSpeechModelsRow) SpeechModelResponse {
	var cfg map[string]any
	if len(row.Config) > 0 {
		// Best-effort decode; an unparsable config is surfaced as nil.
		_ = json.Unmarshal(row.Config, &cfg)
	}
	// Name is nullable in the DB; default to empty string.
	name := ""
	if row.Name.Valid {
		name = row.Name.String
	}
	return SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		Name:         name,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		Config:       cfg,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
}
// toSpeechModelFromModel converts a bare model row into the API response
// shape. providerType must be supplied by the caller (the sqlc.Model row
// itself does not carry it) and may be empty.
func toSpeechModelFromModel(row sqlc.Model, providerType string) SpeechModelResponse {
	var cfg map[string]any
	if len(row.Config) > 0 {
		// Best-effort decode; an unparsable config is surfaced as nil.
		_ = json.Unmarshal(row.Config, &cfg)
	}
	// Name is nullable in the DB; default to empty string.
	name := ""
	if row.Name.Valid {
		name = row.Name.String
	}
	return SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		Name:         name,
		ProviderID:   row.ProviderID.String(),
		ProviderType: providerType,
		Config:       cfg,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
}
// toSpeechModelWithProviderResponse converts a model row joined with its
// provider type into the API response shape. Malformed config JSON is
// ignored and yields a nil Config.
func toSpeechModelWithProviderResponse(row sqlc.GetSpeechModelWithProviderRow) SpeechModelResponse {
	var cfg map[string]any
	if len(row.Config) > 0 {
		// Best-effort decode; an unparsable config is surfaced as nil.
		_ = json.Unmarshal(row.Config, &cfg)
	}
	// Name is nullable in the DB; default to empty string.
	name := ""
	if row.Name.Valid {
		name = row.Name.String
	}
	return SpeechModelResponse{
		ID:           row.ID.String(),
		ModelID:      row.ModelID,
		Name:         name,
		ProviderID:   row.ProviderID.String(),
		ProviderType: row.ProviderType,
		Config:       cfg,
		CreatedAt:    row.CreatedAt.Time,
		UpdatedAt:    row.UpdatedAt.Time,
	}
}
@@ -1,4 +1,4 @@
package audio
package tts
import (
"fmt"
@@ -13,7 +13,7 @@ import (
const (
defaultTTL = 10 * time.Minute
cleanupInterval = 1 * time.Minute
tempDirName = "audio_temp"
tempDirName = "tts_temp"
)
// TempStore manages temporary audio files on disk with automatic TTL-based cleanup.
@@ -30,7 +30,7 @@ type TempStore struct {
func NewTempStore(baseDir string) (*TempStore, error) {
dir := filepath.Join(baseDir, tempDirName)
if err := os.MkdirAll(dir, 0o750); err != nil {
return nil, fmt.Errorf("create audio temp dir: %w", err)
return nil, fmt.Errorf("create tts temp dir: %w", err)
}
return &TempStore{
dir: dir,
+62
View File
@@ -0,0 +1,62 @@
package tts
import "time"
// ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
type ProviderMetaResponse struct {
	Provider     string       `json:"provider"`      // registry client-type identifier
	DisplayName  string       `json:"display_name"`  // human-readable provider name
	Description  string       `json:"description"`
	ConfigSchema ConfigSchema `json:"config_schema,omitempty"` // provider-level config fields
	DefaultModel string       `json:"default_model"`
	Models       []ModelInfo  `json:"models"` // built-in model templates
}
// SpeechProviderResponse represents a speech-capable provider from the unified providers table.
type SpeechProviderResponse struct {
	ID         string         `json:"id"` // provider UUID
	Name       string         `json:"name"`
	ClientType string         `json:"client_type"` // registry client-type identifier
	Icon       string         `json:"icon,omitempty"`
	Enable     bool           `json:"enable"`
	Config     map[string]any `json:"config,omitempty"` // secret values are masked before serving
	CreatedAt  time.Time      `json:"created_at"`
	UpdatedAt  time.Time      `json:"updated_at"`
}
// SpeechModelResponse represents a speech model from the unified models table.
type SpeechModelResponse struct {
	ID           string         `json:"id"`       // model row UUID
	ModelID      string         `json:"model_id"` // provider-facing model identifier
	Name         string         `json:"name"`
	ProviderID   string         `json:"provider_id"`
	ProviderType string         `json:"provider_type,omitempty"` // may be empty when listed per provider
	Config       map[string]any `json:"config,omitempty"`
	CreatedAt    time.Time      `json:"created_at"`
	UpdatedAt    time.Time      `json:"updated_at"`
}
// UpdateSpeechProviderRequest is used for updating a speech provider.
// Nil pointer fields mean "leave unchanged".
type UpdateSpeechProviderRequest struct {
	Name   *string `json:"name,omitempty"`
	Enable *bool   `json:"enable,omitempty"`
}
// UpdateSpeechModelRequest is used for updating a speech model.
// A nil Name means "leave unchanged".
type UpdateSpeechModelRequest struct {
	Name   *string        `json:"name,omitempty"`
	Config map[string]any `json:"config,omitempty"`
}
// TestSynthesizeRequest represents a text-to-speech test request.
type TestSynthesizeRequest struct {
	Text   string         `json:"text"`             // text to synthesize
	Config map[string]any `json:"config,omitempty"` // per-request config override
}
// ImportModelsResponse represents the response for importing speech models.
type ImportModelsResponse struct {
	Created int      `json:"created"` // number of models newly persisted
	Skipped int      `json:"skipped"` // number of models already present
	Models  []string `json:"models"`  // identifiers affected by the import
}
-2
View File
@@ -175,7 +175,6 @@ func withWorkspaceGPUPreference(metadata map[string]any, gpu WorkspaceGPUConfig)
return next
}
//nolint:unused // Kept for tests and upcoming metadata plumbing.
func withWorkspaceSkillDiscoveryRoots(metadata map[string]any, roots []string) map[string]any {
next := cloneAnyMap(metadata)
section := workspaceSection(next)
@@ -200,7 +199,6 @@ func withoutWorkspaceGPUPreference(metadata map[string]any) map[string]any {
return next
}
//nolint:unused // Kept for tests and upcoming metadata plumbing.
func withoutWorkspaceSkillDiscoveryRoots(metadata map[string]any) map[string]any {
next := cloneAnyMap(metadata)
section := workspaceSection(next)
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+107 -502
View File
@@ -310,146 +310,6 @@ export type AdaptersUsageResponse = {
total_text_bytes?: number;
};
export type AudioConfigSchema = {
fields?: Array<AudioFieldSchema>;
};
export type AudioFieldSchema = {
advanced?: boolean;
description?: string;
enum?: Array<string>;
example?: unknown;
key?: string;
order?: number;
required?: boolean;
title?: string;
type?: string;
};
export type AudioImportModelsResponse = {
created?: number;
models?: Array<string>;
skipped?: number;
};
export type AudioModelCapabilities = {
config_schema?: AudioConfigSchema;
formats?: Array<string>;
metadata?: {
[key: string]: string;
};
pitch?: AudioParamConstraint;
speed?: AudioParamConstraint;
voices?: Array<AudioVoiceInfo>;
};
export type AudioModelInfo = {
capabilities?: AudioModelCapabilities;
config_schema?: AudioConfigSchema;
description?: string;
id?: string;
name?: string;
template_only?: boolean;
};
export type AudioParamConstraint = {
default?: number;
max?: number;
min?: number;
options?: Array<number>;
};
export type AudioProviderMetaResponse = {
config_schema?: AudioConfigSchema;
default_model?: string;
default_synthesis_model?: string;
default_transcription_model?: string;
description?: string;
display_name?: string;
models?: Array<AudioModelInfo>;
provider?: string;
supports_synthesis_list?: boolean;
supports_transcription_list?: boolean;
synthesis_models?: Array<AudioModelInfo>;
transcription_models?: Array<AudioModelInfo>;
};
export type AudioSpeechModelResponse = {
config?: {
[key: string]: unknown;
};
created_at?: string;
id?: string;
model_id?: string;
name?: string;
provider_id?: string;
provider_type?: string;
updated_at?: string;
};
export type AudioSpeechProviderResponse = {
client_type?: string;
config?: {
[key: string]: unknown;
};
created_at?: string;
enable?: boolean;
icon?: string;
id?: string;
name?: string;
updated_at?: string;
};
export type AudioTestSynthesizeRequest = {
config?: {
[key: string]: unknown;
};
text?: string;
};
export type AudioTestTranscriptionResponse = {
duration_seconds?: number;
language?: string;
metadata?: {
[key: string]: unknown;
};
text?: string;
words?: Array<AudioTranscriptionWord>;
};
export type AudioTranscriptionModelResponse = {
config?: {
[key: string]: unknown;
};
created_at?: string;
id?: string;
model_id?: string;
name?: string;
provider_id?: string;
provider_type?: string;
updated_at?: string;
};
export type AudioTranscriptionWord = {
end?: number;
speaker_id?: string;
start?: number;
text?: string;
};
export type AudioUpdateSpeechModelRequest = {
config?: {
[key: string]: unknown;
};
name?: string;
};
export type AudioVoiceInfo = {
id?: string;
lang?: string;
name?: string;
};
export type BotsBot = {
avatar_url?: string;
check_issue_count?: number;
@@ -613,7 +473,7 @@ export type ChannelChannelIdentityBinding = {
updated_at?: string;
};
export type ChannelChannelType = 'telegram' | 'feishu' | 'dingtalk' | 'matrix' | 'discord' | 'qq' | 'wecom' | 'weixin' | 'wechatoa' | 'local' | 'slack';
export type ChannelChannelType = 'telegram' | 'feishu' | 'dingtalk' | 'matrix' | 'discord' | 'qq' | 'wecom' | 'weixin' | 'wechatoa' | 'local';
export type ChannelConfigSchema = {
fields?: {
@@ -1494,7 +1354,7 @@ export type ModelsModelConfig = {
reasoning_efforts?: Array<string>;
};
export type ModelsModelType = 'chat' | 'embedding' | 'speech' | 'transcription';
export type ModelsModelType = 'chat' | 'embedding' | 'speech';
export type ModelsTestResponse = {
latency_ms?: number;
@@ -1755,7 +1615,6 @@ export type SettingsSettings = {
search_provider_id?: string;
timezone?: string;
title_model_id?: string;
transcription_model_id?: string;
tts_model_id?: string;
};
@@ -1780,10 +1639,105 @@ export type SettingsUpsertRequest = {
search_provider_id?: string;
timezone?: string;
title_model_id?: string;
transcription_model_id?: string;
tts_model_id?: string;
};
export type TtsConfigSchema = {
fields?: Array<TtsFieldSchema>;
};
export type TtsFieldSchema = {
advanced?: boolean;
description?: string;
enum?: Array<string>;
example?: unknown;
key?: string;
order?: number;
required?: boolean;
title?: string;
type?: string;
};
export type TtsImportModelsResponse = {
created?: number;
models?: Array<string>;
skipped?: number;
};
export type TtsModelCapabilities = {
config_schema?: TtsConfigSchema;
formats?: Array<string>;
metadata?: {
[key: string]: string;
};
pitch?: TtsParamConstraint;
speed?: TtsParamConstraint;
voices?: Array<TtsVoiceInfo>;
};
export type TtsModelInfo = {
capabilities?: TtsModelCapabilities;
config_schema?: TtsConfigSchema;
description?: string;
id?: string;
name?: string;
};
export type TtsParamConstraint = {
default?: number;
max?: number;
min?: number;
options?: Array<number>;
};
export type TtsProviderMetaResponse = {
config_schema?: TtsConfigSchema;
default_model?: string;
description?: string;
display_name?: string;
models?: Array<TtsModelInfo>;
provider?: string;
};
export type TtsSpeechModelResponse = {
config?: {
[key: string]: unknown;
};
created_at?: string;
id?: string;
model_id?: string;
name?: string;
provider_id?: string;
provider_type?: string;
updated_at?: string;
};
export type TtsSpeechProviderResponse = {
client_type?: string;
config?: {
[key: string]: unknown;
};
created_at?: string;
enable?: boolean;
icon?: string;
id?: string;
name?: string;
updated_at?: string;
};
export type TtsTestSynthesizeRequest = {
config?: {
[key: string]: unknown;
};
text?: string;
};
export type TtsVoiceInfo = {
id?: string;
lang?: string;
name?: string;
};
export type PostAuthLoginData = {
/**
* Login request
@@ -8268,7 +8222,7 @@ export type GetSpeechModelsResponses = {
/**
* OK
*/
200: Array<AudioSpeechModelResponse>;
200: Array<TtsSpeechModelResponse>;
};
export type GetSpeechModelsResponse = GetSpeechModelsResponses[keyof GetSpeechModelsResponses];
@@ -8298,48 +8252,11 @@ export type GetSpeechModelsByIdResponses = {
/**
* OK
*/
200: AudioSpeechModelResponse;
200: TtsSpeechModelResponse;
};
export type GetSpeechModelsByIdResponse = GetSpeechModelsByIdResponses[keyof GetSpeechModelsByIdResponses];
export type PutSpeechModelsByIdData = {
/**
* Model update payload
*/
body: AudioUpdateSpeechModelRequest;
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/speech-models/{id}';
};
export type PutSpeechModelsByIdErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PutSpeechModelsByIdError = PutSpeechModelsByIdErrors[keyof PutSpeechModelsByIdErrors];
export type PutSpeechModelsByIdResponses = {
/**
* OK
*/
200: AudioSpeechModelResponse;
};
export type PutSpeechModelsByIdResponse = PutSpeechModelsByIdResponses[keyof PutSpeechModelsByIdResponses];
export type GetSpeechModelsByIdCapabilitiesData = {
body?: never;
path: {
@@ -8365,7 +8282,7 @@ export type GetSpeechModelsByIdCapabilitiesResponses = {
/**
* OK
*/
200: AudioModelCapabilities;
200: TtsModelCapabilities;
};
export type GetSpeechModelsByIdCapabilitiesResponse = GetSpeechModelsByIdCapabilitiesResponses[keyof GetSpeechModelsByIdCapabilitiesResponses];
@@ -8374,7 +8291,7 @@ export type PostSpeechModelsByIdTestData = {
/**
* Text to synthesize
*/
body: AudioTestSynthesizeRequest;
body: TtsTestSynthesizeRequest;
path: {
/**
* Model ID
@@ -8425,7 +8342,7 @@ export type GetSpeechProvidersResponses = {
/**
* OK
*/
200: Array<AudioSpeechProviderResponse>;
200: Array<TtsSpeechProviderResponse>;
};
export type GetSpeechProvidersResponse = GetSpeechProvidersResponses[keyof GetSpeechProvidersResponses];
@@ -8441,7 +8358,7 @@ export type GetSpeechProvidersMetaResponses = {
/**
* OK
*/
200: Array<AudioProviderMetaResponse>;
200: Array<TtsProviderMetaResponse>;
};
export type GetSpeechProvidersMetaResponse = GetSpeechProvidersMetaResponses[keyof GetSpeechProvidersMetaResponses];
@@ -8475,7 +8392,7 @@ export type GetSpeechProvidersByIdResponses = {
/**
* OK
*/
200: AudioSpeechProviderResponse;
200: TtsSpeechProviderResponse;
};
export type GetSpeechProvidersByIdResponse = GetSpeechProvidersByIdResponses[keyof GetSpeechProvidersByIdResponses];
@@ -8513,7 +8430,7 @@ export type PostSpeechProvidersByIdImportModelsResponses = {
/**
* OK
*/
200: AudioImportModelsResponse;
200: TtsImportModelsResponse;
};
export type PostSpeechProvidersByIdImportModelsResponse = PostSpeechProvidersByIdImportModelsResponses[keyof PostSpeechProvidersByIdImportModelsResponses];
@@ -8547,7 +8464,7 @@ export type GetSpeechProvidersByIdModelsResponses = {
/**
* OK
*/
200: Array<AudioSpeechModelResponse>;
200: Array<TtsSpeechModelResponse>;
};
export type GetSpeechProvidersByIdModelsResponse = GetSpeechProvidersByIdModelsResponses[keyof GetSpeechProvidersByIdModelsResponses];
@@ -8733,318 +8650,6 @@ export type GetSupermarketTagsResponses = {
export type GetSupermarketTagsResponse = GetSupermarketTagsResponses[keyof GetSupermarketTagsResponses];
export type GetTranscriptionModelsData = {
body?: never;
path?: never;
query?: never;
url: '/transcription-models';
};
export type GetTranscriptionModelsErrors = {
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type GetTranscriptionModelsError = GetTranscriptionModelsErrors[keyof GetTranscriptionModelsErrors];
export type GetTranscriptionModelsResponses = {
/**
* OK
*/
200: Array<AudioTranscriptionModelResponse>;
};
export type GetTranscriptionModelsResponse = GetTranscriptionModelsResponses[keyof GetTranscriptionModelsResponses];
export type GetTranscriptionModelsByIdData = {
body?: never;
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/transcription-models/{id}';
};
export type GetTranscriptionModelsByIdErrors = {
/**
* Not Found
*/
404: HandlersErrorResponse;
};
export type GetTranscriptionModelsByIdError = GetTranscriptionModelsByIdErrors[keyof GetTranscriptionModelsByIdErrors];
export type GetTranscriptionModelsByIdResponses = {
/**
* OK
*/
200: AudioTranscriptionModelResponse;
};
export type GetTranscriptionModelsByIdResponse = GetTranscriptionModelsByIdResponses[keyof GetTranscriptionModelsByIdResponses];
export type PutTranscriptionModelsByIdData = {
/**
* Model update payload
*/
body: AudioUpdateSpeechModelRequest;
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/transcription-models/{id}';
};
export type PutTranscriptionModelsByIdErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PutTranscriptionModelsByIdError = PutTranscriptionModelsByIdErrors[keyof PutTranscriptionModelsByIdErrors];
export type PutTranscriptionModelsByIdResponses = {
/**
* OK
*/
200: AudioTranscriptionModelResponse;
};
export type PutTranscriptionModelsByIdResponse = PutTranscriptionModelsByIdResponses[keyof PutTranscriptionModelsByIdResponses];
export type GetTranscriptionModelsByIdCapabilitiesData = {
body?: never;
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/transcription-models/{id}/capabilities';
};
export type GetTranscriptionModelsByIdCapabilitiesErrors = {
/**
* Not Found
*/
404: HandlersErrorResponse;
};
export type GetTranscriptionModelsByIdCapabilitiesError = GetTranscriptionModelsByIdCapabilitiesErrors[keyof GetTranscriptionModelsByIdCapabilitiesErrors];
export type GetTranscriptionModelsByIdCapabilitiesResponses = {
/**
* OK
*/
200: AudioModelCapabilities;
};
export type GetTranscriptionModelsByIdCapabilitiesResponse = GetTranscriptionModelsByIdCapabilitiesResponses[keyof GetTranscriptionModelsByIdCapabilitiesResponses];
export type PostTranscriptionModelsByIdTestData = {
body: {
/**
* Audio file
*/
file: Blob | File;
/**
* Optional JSON config
*/
config?: string;
};
path: {
/**
* Model ID
*/
id: string;
};
query?: never;
url: '/transcription-models/{id}/test';
};
export type PostTranscriptionModelsByIdTestErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PostTranscriptionModelsByIdTestError = PostTranscriptionModelsByIdTestErrors[keyof PostTranscriptionModelsByIdTestErrors];
export type PostTranscriptionModelsByIdTestResponses = {
/**
* OK
*/
200: AudioTestTranscriptionResponse;
};
export type PostTranscriptionModelsByIdTestResponse = PostTranscriptionModelsByIdTestResponses[keyof PostTranscriptionModelsByIdTestResponses];
export type GetTranscriptionProvidersData = {
body?: never;
path?: never;
query?: never;
url: '/transcription-providers';
};
export type GetTranscriptionProvidersErrors = {
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type GetTranscriptionProvidersError = GetTranscriptionProvidersErrors[keyof GetTranscriptionProvidersErrors];
export type GetTranscriptionProvidersResponses = {
/**
* OK
*/
200: Array<AudioSpeechProviderResponse>;
};
export type GetTranscriptionProvidersResponse = GetTranscriptionProvidersResponses[keyof GetTranscriptionProvidersResponses];
export type GetTranscriptionProvidersMetaData = {
body?: never;
path?: never;
query?: never;
url: '/transcription-providers/meta';
};
export type GetTranscriptionProvidersMetaResponses = {
/**
* OK
*/
200: Array<AudioProviderMetaResponse>;
};
export type GetTranscriptionProvidersMetaResponse = GetTranscriptionProvidersMetaResponses[keyof GetTranscriptionProvidersMetaResponses];
export type GetTranscriptionProvidersByIdData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/transcription-providers/{id}';
};
export type GetTranscriptionProvidersByIdErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Not Found
*/
404: HandlersErrorResponse;
};
export type GetTranscriptionProvidersByIdError = GetTranscriptionProvidersByIdErrors[keyof GetTranscriptionProvidersByIdErrors];
export type GetTranscriptionProvidersByIdResponses = {
/**
* OK
*/
200: AudioSpeechProviderResponse;
};
export type GetTranscriptionProvidersByIdResponse = GetTranscriptionProvidersByIdResponses[keyof GetTranscriptionProvidersByIdResponses];
export type PostTranscriptionProvidersByIdImportModelsData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/transcription-providers/{id}/import-models';
};
export type PostTranscriptionProvidersByIdImportModelsErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Not Found
*/
404: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type PostTranscriptionProvidersByIdImportModelsError = PostTranscriptionProvidersByIdImportModelsErrors[keyof PostTranscriptionProvidersByIdImportModelsErrors];
export type PostTranscriptionProvidersByIdImportModelsResponses = {
/**
* OK
*/
200: AudioImportModelsResponse;
};
export type PostTranscriptionProvidersByIdImportModelsResponse = PostTranscriptionProvidersByIdImportModelsResponses[keyof PostTranscriptionProvidersByIdImportModelsResponses];
export type GetTranscriptionProvidersByIdModelsData = {
body?: never;
path: {
/**
* Provider ID (UUID)
*/
id: string;
};
query?: never;
url: '/transcription-providers/{id}/models';
};
export type GetTranscriptionProvidersByIdModelsErrors = {
/**
* Bad Request
*/
400: HandlersErrorResponse;
/**
* Internal Server Error
*/
500: HandlersErrorResponse;
};
export type GetTranscriptionProvidersByIdModelsError = GetTranscriptionProvidersByIdModelsErrors[keyof GetTranscriptionProvidersByIdModelsErrors];
export type GetTranscriptionProvidersByIdModelsResponses = {
/**
* OK
*/
200: Array<AudioTranscriptionModelResponse>;
};
export type GetTranscriptionProvidersByIdModelsResponse = GetTranscriptionProvidersByIdModelsResponses[keyof GetTranscriptionProvidersByIdModelsResponses];
export type GetUsersData = {
body?: never;
path?: never;
+257 -815
View File
File diff suppressed because it is too large Load Diff
+257 -815
View File
File diff suppressed because it is too large Load Diff
+170 -545
View File
@@ -489,240 +489,6 @@ definitions:
total_text_bytes:
type: integer
type: object
audio.ConfigSchema:
properties:
fields:
items:
$ref: '#/definitions/audio.FieldSchema'
type: array
type: object
audio.FieldSchema:
properties:
advanced:
type: boolean
description:
type: string
enum:
items:
type: string
type: array
example: {}
key:
type: string
order:
type: integer
required:
type: boolean
title:
type: string
type:
type: string
type: object
audio.ImportModelsResponse:
properties:
created:
type: integer
models:
items:
type: string
type: array
skipped:
type: integer
type: object
audio.ModelCapabilities:
properties:
config_schema:
$ref: '#/definitions/audio.ConfigSchema'
formats:
items:
type: string
type: array
metadata:
additionalProperties:
type: string
type: object
pitch:
$ref: '#/definitions/audio.ParamConstraint'
speed:
$ref: '#/definitions/audio.ParamConstraint'
voices:
items:
$ref: '#/definitions/audio.VoiceInfo'
type: array
type: object
audio.ModelInfo:
properties:
capabilities:
$ref: '#/definitions/audio.ModelCapabilities'
config_schema:
$ref: '#/definitions/audio.ConfigSchema'
description:
type: string
id:
type: string
name:
type: string
template_only:
type: boolean
type: object
audio.ParamConstraint:
properties:
default:
type: number
max:
type: number
min:
type: number
options:
items:
type: number
type: array
type: object
audio.ProviderMetaResponse:
properties:
config_schema:
$ref: '#/definitions/audio.ConfigSchema'
default_model:
type: string
default_synthesis_model:
type: string
default_transcription_model:
type: string
description:
type: string
display_name:
type: string
models:
items:
$ref: '#/definitions/audio.ModelInfo'
type: array
provider:
type: string
supports_synthesis_list:
type: boolean
supports_transcription_list:
type: boolean
synthesis_models:
items:
$ref: '#/definitions/audio.ModelInfo'
type: array
transcription_models:
items:
$ref: '#/definitions/audio.ModelInfo'
type: array
type: object
audio.SpeechModelResponse:
properties:
config:
additionalProperties: {}
type: object
created_at:
type: string
id:
type: string
model_id:
type: string
name:
type: string
provider_id:
type: string
provider_type:
type: string
updated_at:
type: string
type: object
audio.SpeechProviderResponse:
properties:
client_type:
type: string
config:
additionalProperties: {}
type: object
created_at:
type: string
enable:
type: boolean
icon:
type: string
id:
type: string
name:
type: string
updated_at:
type: string
type: object
audio.TestSynthesizeRequest:
properties:
config:
additionalProperties: {}
type: object
text:
type: string
type: object
audio.TestTranscriptionResponse:
properties:
duration_seconds:
type: number
language:
type: string
metadata:
additionalProperties: {}
type: object
text:
type: string
words:
items:
$ref: '#/definitions/audio.TranscriptionWord'
type: array
type: object
audio.TranscriptionModelResponse:
properties:
config:
additionalProperties: {}
type: object
created_at:
type: string
id:
type: string
model_id:
type: string
name:
type: string
provider_id:
type: string
provider_type:
type: string
updated_at:
type: string
type: object
audio.TranscriptionWord:
properties:
end:
type: number
speaker_id:
type: string
start:
type: number
text:
type: string
type: object
audio.UpdateSpeechModelRequest:
properties:
config:
additionalProperties: {}
type: object
name:
type: string
type: object
audio.VoiceInfo:
properties:
id:
type: string
lang:
type: string
name:
type: string
type: object
bots.Bot:
properties:
avatar_url:
@@ -1008,7 +774,6 @@ definitions:
- weixin
- wechatoa
- local
- slack
type: string
x-enum-varnames:
- ChannelTypeTelegram
@@ -1021,7 +786,6 @@ definitions:
- ChannelTypeWeixin
- ChannelTypeWeChatOA
- ChannelTypeLocal
- ChannelTypeSlack
channel.ConfigSchema:
properties:
fields:
@@ -2498,13 +2262,11 @@ definitions:
- chat
- embedding
- speech
- transcription
type: string
x-enum-varnames:
- ModelTypeChat
- ModelTypeEmbedding
- ModelTypeSpeech
- ModelTypeTranscription
models.TestResponse:
properties:
latency_ms:
@@ -2951,8 +2713,6 @@ definitions:
type: string
title_model_id:
type: string
transcription_model_id:
type: string
tts_model_id:
type: string
type: object
@@ -2998,11 +2758,170 @@ definitions:
type: string
title_model_id:
type: string
transcription_model_id:
type: string
tts_model_id:
type: string
type: object
tts.ConfigSchema:
properties:
fields:
items:
$ref: '#/definitions/tts.FieldSchema'
type: array
type: object
tts.FieldSchema:
properties:
advanced:
type: boolean
description:
type: string
enum:
items:
type: string
type: array
example: {}
key:
type: string
order:
type: integer
required:
type: boolean
title:
type: string
type:
type: string
type: object
tts.ImportModelsResponse:
properties:
created:
type: integer
models:
items:
type: string
type: array
skipped:
type: integer
type: object
tts.ModelCapabilities:
properties:
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
formats:
items:
type: string
type: array
metadata:
additionalProperties:
type: string
type: object
pitch:
$ref: '#/definitions/tts.ParamConstraint'
speed:
$ref: '#/definitions/tts.ParamConstraint'
voices:
items:
$ref: '#/definitions/tts.VoiceInfo'
type: array
type: object
tts.ModelInfo:
properties:
capabilities:
$ref: '#/definitions/tts.ModelCapabilities'
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
description:
type: string
id:
type: string
name:
type: string
type: object
tts.ParamConstraint:
properties:
default:
type: number
max:
type: number
min:
type: number
options:
items:
type: number
type: array
type: object
tts.ProviderMetaResponse:
properties:
config_schema:
$ref: '#/definitions/tts.ConfigSchema'
default_model:
type: string
description:
type: string
display_name:
type: string
models:
items:
$ref: '#/definitions/tts.ModelInfo'
type: array
provider:
type: string
type: object
tts.SpeechModelResponse:
properties:
config:
additionalProperties: {}
type: object
created_at:
type: string
id:
type: string
model_id:
type: string
name:
type: string
provider_id:
type: string
provider_type:
type: string
updated_at:
type: string
type: object
tts.SpeechProviderResponse:
properties:
client_type:
type: string
config:
additionalProperties: {}
type: object
created_at:
type: string
enable:
type: boolean
icon:
type: string
id:
type: string
name:
type: string
updated_at:
type: string
type: object
tts.TestSynthesizeRequest:
properties:
config:
additionalProperties: {}
type: object
text:
type: string
type: object
tts.VoiceInfo:
properties:
id:
type: string
lang:
type: string
name:
type: string
type: object
info:
contact: {}
title: Memoh API
@@ -8257,7 +8176,7 @@ paths:
description: OK
schema:
items:
$ref: '#/definitions/audio.SpeechModelResponse'
$ref: '#/definitions/tts.SpeechModelResponse'
type: array
"500":
description: Internal Server Error
@@ -8280,7 +8199,7 @@ paths:
"200":
description: OK
schema:
$ref: '#/definitions/audio.SpeechModelResponse'
$ref: '#/definitions/tts.SpeechModelResponse'
"404":
description: Not Found
schema:
@@ -8288,39 +8207,6 @@ paths:
summary: Get a speech model
tags:
- speech-models
put:
consumes:
- application/json
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
- description: Model update payload
in: body
name: request
required: true
schema:
$ref: '#/definitions/audio.UpdateSpeechModelRequest'
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.SpeechModelResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Update a speech model
tags:
- speech-models
/speech-models/{id}/capabilities:
get:
parameters:
@@ -8335,7 +8221,7 @@ paths:
"200":
description: OK
schema:
$ref: '#/definitions/audio.ModelCapabilities'
$ref: '#/definitions/tts.ModelCapabilities'
"404":
description: Not Found
schema:
@@ -8359,7 +8245,7 @@ paths:
name: request
required: true
schema:
$ref: '#/definitions/audio.TestSynthesizeRequest'
$ref: '#/definitions/tts.TestSynthesizeRequest'
produces:
- application/octet-stream
responses:
@@ -8389,7 +8275,7 @@ paths:
description: OK
schema:
items:
$ref: '#/definitions/audio.SpeechProviderResponse'
$ref: '#/definitions/tts.SpeechProviderResponse'
type: array
"500":
description: Internal Server Error
@@ -8413,7 +8299,7 @@ paths:
"200":
description: OK
schema:
$ref: '#/definitions/audio.SpeechProviderResponse'
$ref: '#/definitions/tts.SpeechProviderResponse'
"400":
description: Bad Request
schema:
@@ -8443,7 +8329,7 @@ paths:
"200":
description: OK
schema:
$ref: '#/definitions/audio.ImportModelsResponse'
$ref: '#/definitions/tts.ImportModelsResponse'
"400":
description: Bad Request
schema:
@@ -8475,7 +8361,7 @@ paths:
description: OK
schema:
items:
$ref: '#/definitions/audio.SpeechModelResponse'
$ref: '#/definitions/tts.SpeechModelResponse'
type: array
"400":
description: Bad Request
@@ -8496,7 +8382,7 @@ paths:
description: OK
schema:
items:
$ref: '#/definitions/audio.ProviderMetaResponse'
$ref: '#/definitions/tts.ProviderMetaResponse'
type: array
summary: List speech provider metadata
tags:
@@ -8629,267 +8515,6 @@ paths:
summary: List all tags from supermarket
tags:
- supermarket
/transcription-models:
get:
description: List all models of type 'transcription' (filtered view of unified
models table)
produces:
- application/json
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/audio.TranscriptionModelResponse'
type: array
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: List all transcription models
tags:
- transcription-models
/transcription-models/{id}:
get:
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.TranscriptionModelResponse'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Get a transcription model
tags:
- transcription-models
put:
consumes:
- application/json
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
- description: Model update payload
in: body
name: request
required: true
schema:
$ref: '#/definitions/audio.UpdateSpeechModelRequest'
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.TranscriptionModelResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Update a transcription model
tags:
- transcription-models
/transcription-models/{id}/capabilities:
get:
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.ModelCapabilities'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Get transcription model capabilities
tags:
- transcription-models
/transcription-models/{id}/test:
post:
consumes:
- multipart/form-data
description: Transcribe uploaded audio using a specific model's config and return
structured text output
parameters:
- description: Model ID
in: path
name: id
required: true
type: string
- description: Audio file
in: formData
name: file
required: true
type: file
- description: Optional JSON config
in: formData
name: config
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.TestTranscriptionResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Test transcription model recognition
tags:
- transcription-models
/transcription-providers:
get:
description: List providers that support transcription (filtered view of unified
providers table)
produces:
- application/json
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/audio.SpeechProviderResponse'
type: array
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: List transcription providers
tags:
- transcription-providers
/transcription-providers/{id}:
get:
description: Get a speech provider with masked config values
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.SpeechProviderResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Get speech provider
tags:
- speech-providers
/transcription-providers/{id}/import-models:
post:
consumes:
- application/json
description: Fetch models using the configured transcription provider and import
them into the unified models table
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
$ref: '#/definitions/audio.ImportModelsResponse'
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"404":
description: Not Found
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: Import transcription models from provider
tags:
- transcription-providers
/transcription-providers/{id}/models:
get:
description: List models of type 'transcription' for a specific transcription
provider
parameters:
- description: Provider ID (UUID)
in: path
name: id
required: true
type: string
produces:
- application/json
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/audio.TranscriptionModelResponse'
type: array
"400":
description: Bad Request
schema:
$ref: '#/definitions/handlers.ErrorResponse'
"500":
description: Internal Server Error
schema:
$ref: '#/definitions/handlers.ErrorResponse'
summary: List transcription models by provider
tags:
- transcription-providers
/transcription-providers/meta:
get:
description: List available transcription provider types with their models and
capabilities
responses:
"200":
description: OK
schema:
items:
$ref: '#/definitions/audio.ProviderMetaResponse'
type: array
summary: List transcription provider metadata
tags:
- transcription-providers
/users:
get:
description: List users