fix: separate audio domain and restore transcription templates

Move speech and transcription internals into the audio domain, restore template-driven transcription providers, and regenerate Swagger/SDK so the frontend can stop hand-calling /transcription-* APIs.
This commit is contained in:
Acbox
2026-04-21 23:33:36 +08:00
parent f845e936f8
commit 7376bc5adb
43 changed files with 3511 additions and 1116 deletions
+2 -4
View File
@@ -72,8 +72,7 @@ func TestSpawnAndNotify(t *testing.T) {
task := mgr.Get(taskID)
if task == nil {
t.Fatal("task not found after completion")
}
if task.Status != TaskCompleted {
} else if task.Status != TaskCompleted {
t.Errorf("expected task status completed, got %s", task.Status)
}
}
@@ -130,8 +129,7 @@ func TestKillTask(t *testing.T) {
task := mgr.Get(taskID)
if task == nil {
t.Fatal("task not found")
}
if task.Status != TaskKilled {
} else if task.Status != TaskKilled {
t.Errorf("expected status killed, got %s", task.Status)
}
+6 -6
View File
@@ -16,9 +16,9 @@ import (
sdk "github.com/memohai/twilight-ai/sdk"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/media"
"github.com/memohai/memoh/internal/settings"
ttspkg "github.com/memohai/memoh/internal/tts"
)
const mediaDataPrefix = "/data/media/"
@@ -26,19 +26,19 @@ const mediaDataPrefix = "/data/media/"
type TranscriptionProvider struct {
logger *slog.Logger
settings *settings.Service
tts *ttspkg.Service
audio *audiopkg.Service
media *media.Service
http *http.Client
}
func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
if log == nil {
log = slog.Default()
}
return &TranscriptionProvider{
logger: log.With(slog.String("tool", "transcribe_audio")),
settings: settingsSvc,
tts: ttsSvc,
audio: audioSvc,
media: mediaSvc,
http: &http.Client{
Timeout: 30 * time.Second,
@@ -56,7 +56,7 @@ func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, t
}
func (p *TranscriptionProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
if session.IsSubagent || p.settings == nil || p.tts == nil || p.media == nil {
if session.IsSubagent || p.settings == nil || p.audio == nil || p.media == nil {
return nil, nil
}
botID := strings.TrimSpace(session.BotID)
@@ -120,7 +120,7 @@ func (p *TranscriptionProvider) execTranscribe(ctx context.Context, session Sess
if prompt := FirstStringArg(args, "prompt"); prompt != "" {
override["prompt"] = prompt
}
result, err := p.tts.Transcribe(ctx, modelID, audio, filename, contentType, override)
result, err := p.audio.Transcribe(ctx, modelID, audio, filename, contentType, override)
if err != nil {
return nil, err
}
+6 -6
View File
@@ -10,9 +10,9 @@ import (
sdk "github.com/memohai/twilight-ai/sdk"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/channel"
"github.com/memohai/memoh/internal/settings"
ttspkg "github.com/memohai/memoh/internal/tts"
)
const ttsMaxTextLen = 500
@@ -30,26 +30,26 @@ type TTSChannelResolver interface {
type TTSProvider struct {
logger *slog.Logger
settings *settings.Service
tts *ttspkg.Service
audio *audiopkg.Service
sender TTSSender
resolver TTSChannelResolver
}
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
if log == nil {
log = slog.Default()
}
return &TTSProvider{
logger: log.With(slog.String("tool", "tts")),
settings: settingsSvc,
tts: ttsSvc,
audio: audioSvc,
sender: sender,
resolver: resolver,
}
}
func (p *TTSProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
if session.IsSubagent || p.settings == nil || p.tts == nil || p.sender == nil || p.resolver == nil {
if session.IsSubagent || p.settings == nil || p.audio == nil || p.sender == nil || p.resolver == nil {
return nil, nil
}
botID := strings.TrimSpace(session.BotID)
@@ -115,7 +115,7 @@ func (p *TTSProvider) execSpeak(ctx context.Context, session SessionContext, arg
if botSettings.TtsModelID == "" {
return nil, errors.New("bot has no TTS model configured")
}
audioData, contentType, synthErr := p.tts.Synthesize(ctx, botSettings.TtsModelID, text, nil)
audioData, contentType, synthErr := p.audio.Synthesize(ctx, botSettings.TtsModelID, text, nil)
if synthErr != nil {
return nil, fmt.Errorf("speech synthesis failed: %s", synthErr.Error())
}
@@ -1,4 +1,4 @@
package tts
package audio
import "context"
@@ -6,10 +6,10 @@ import (
"log/slog"
"strings"
"github.com/memohai/memoh/internal/tts"
"github.com/memohai/memoh/internal/audio"
)
const TtsTypeEdge tts.TtsType = "edge"
const TtsTypeEdge audio.TtsType = "edge"
const edgeModelReadAloud = "edge-read-aloud"
@@ -33,12 +33,12 @@ func NewEdgeAdapterWithClient(log *slog.Logger, client *EdgeWsClient) *EdgeAdapt
}
}
func (*EdgeAdapter) Type() tts.TtsType {
func (*EdgeAdapter) Type() audio.TtsType {
return TtsTypeEdge
}
func (*EdgeAdapter) Meta() tts.TtsMeta {
return tts.TtsMeta{
func (*EdgeAdapter) Meta() audio.TtsMeta {
return audio.TtsMeta{
Provider: "Microsoft Edge",
Description: "Microsoft Edge TTS",
}
@@ -54,32 +54,32 @@ var edgeFormats = []string{
"webm-24khz-16bit-mono-opus",
}
var edgeSpeedConstraint = &tts.ParamConstraint{
var edgeSpeedConstraint = &audio.ParamConstraint{
Options: []float64{0.5, 1.0, 2.0, 3.0},
Default: 1.0,
}
var edgePitchConstraint = &tts.ParamConstraint{
var edgePitchConstraint = &audio.ParamConstraint{
Min: -100,
Max: 100,
Default: 0,
}
func (*EdgeAdapter) Models() []tts.ModelInfo {
var voices []tts.VoiceInfo
func (*EdgeAdapter) Models() []audio.ModelInfo {
var voices []audio.VoiceInfo
for lang, ids := range EdgeTTSVoices {
for _, id := range ids {
name := strings.TrimPrefix(id, lang+"-")
name = strings.TrimSuffix(name, "Neural")
voices = append(voices, tts.VoiceInfo{ID: id, Lang: lang, Name: name})
voices = append(voices, audio.VoiceInfo{ID: id, Lang: lang, Name: name})
}
}
return []tts.ModelInfo{
return []audio.ModelInfo{
{
ID: edgeModelReadAloud,
Name: "Edge Read Aloud",
Description: "Built-in Edge Read Aloud speech model",
Capabilities: tts.ModelCapabilities{
Capabilities: audio.ModelCapabilities{
Voices: voices,
Formats: edgeFormats,
Speed: edgeSpeedConstraint,
@@ -100,14 +100,14 @@ func (*EdgeAdapter) ResolveModel(model string) (string, error) {
return edgeModelReadAloud, nil
}
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config tts.AudioConfig) ([]byte, error) {
func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config audio.AudioConfig) ([]byte, error) {
if err := config.Validate(); err != nil {
return nil, fmt.Errorf("edge tts: invalid config: %w", err)
}
return a.client.Synthesize(ctx, text, config)
}
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config tts.AudioConfig) (chan []byte, chan error) {
func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config audio.AudioConfig) (chan []byte, chan error) {
if err := config.Validate(); err != nil {
errCh := make(chan error, 1)
errCh <- fmt.Errorf("edge tts: invalid config: %w", err)
@@ -8,7 +8,7 @@ import (
"strings"
"testing"
"github.com/memohai/memoh/internal/tts"
"github.com/memohai/memoh/internal/audio"
)
func TestEdgeAdapter_TypeAndMeta(t *testing.T) {
@@ -37,7 +37,7 @@ func TestEdgeAdapter_Synthesize_WithMockServer(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
audio, err := adapter.Synthesize(ctx, "Hello", edgeModelReadAloud, config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -61,7 +61,7 @@ func TestEdgeAdapter_Stream_WithMockServer(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
ch, errCh := adapter.Stream(ctx, "Hi", edgeModelReadAloud, config)
var chunks [][]byte
for b := range ch {
@@ -86,7 +86,7 @@ func TestEdgeAdapter_Synthesize_NotConnected(t *testing.T) {
adapter := NewEdgeAdapterWithClient(slog.Default(), client)
ctx := context.Background()
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, tts.AudioConfig{})
_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, audio.AudioConfig{})
if err == nil {
t.Fatal("expected error when connection fails")
}
@@ -20,7 +20,7 @@ import (
"github.com/google/uuid"
"github.com/gorilla/websocket"
"github.com/memohai/memoh/internal/tts"
"github.com/memohai/memoh/internal/audio"
)
// Edge TTS WebSocket client.
@@ -184,7 +184,7 @@ func (c *EdgeWsClient) sendFrame(path, contentType, body string, extraHeaders ma
}
// Configure sends the speech.config message (output format, etc.).
func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) error {
func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig) error {
c.mu.Lock()
defer c.mu.Unlock()
if c.conn == nil {
@@ -207,7 +207,7 @@ func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) er
}
// buildSSML builds SSML with rate and pitch for Edge TTS prosody.
func buildSSML(text string, voice tts.VoiceConfig, speed, pitch float64) string {
func buildSSML(text string, voice audio.VoiceConfig, speed, pitch float64) string {
voiceID := voice.ID
if voiceID == "" {
voiceID = DEFAULT_VOICE
@@ -241,7 +241,7 @@ func escapeSSML(s string) string {
// Synthesize sends SSML and synchronously collects all audio data.
// It handles the full lifecycle: connect → configure → send → receive → close.
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config tts.AudioConfig) ([]byte, error) {
func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config audio.AudioConfig) ([]byte, error) {
if err := c.Connect(ctx); err != nil {
return nil, err
}
@@ -338,7 +338,7 @@ func parseAudioChunk(data []byte) ([]byte, error) {
// Stream sends SSML and returns audio chunks via channel.
// It handles the full lifecycle: connect → configure → send → stream → close.
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config tts.AudioConfig) (ch chan []byte, errCh chan error) {
func (c *EdgeWsClient) Stream(ctx context.Context, text string, config audio.AudioConfig) (ch chan []byte, errCh chan error) {
ch = make(chan []byte, 8)
errCh = make(chan error, 1)
go func() {
@@ -9,7 +9,7 @@ import (
"testing"
"time"
"github.com/memohai/memoh/internal/tts"
"github.com/memohai/memoh/internal/audio"
)
// Real Edge TTS integration tests. Not compiled by default (requires -tags=integration).
@@ -17,14 +17,14 @@ import (
//
// Run:
//
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS -v
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS -v
func TestRealEdgeTTS_Synthesize(t *testing.T) {
client := NewEdgeWsClient()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
audio, err := client.Synthesize(ctx, "Hello, this is a real Edge TTS test.", config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -40,7 +40,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
ch, errCh := client.Stream(ctx, "你好,这是流式测试。", config)
var total int
for b := range ch {
@@ -57,7 +57,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
// TestRealEdgeTTS_Formats tries every candidate format and reports which ones are supported.
//
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_Formats -v
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_Formats -v
func TestRealEdgeTTS_Formats(t *testing.T) {
formats := []string{
"audio-24khz-48kbitrate-mono-mp3",
@@ -71,8 +71,8 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
config := tts.AudioConfig{
Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
config := audio.AudioConfig{
Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
Format: fmt,
Speed: 1.0,
}
@@ -88,7 +88,7 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
// TestRealEdgeTTS_SaveAudio synthesizes speech and writes the result to a file for manual inspection.
//
// go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
// go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
func TestRealEdgeTTS_SaveAudio(t *testing.T) {
client := NewEdgeWsClient()
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -97,11 +97,11 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
cases := []struct {
name string
text string
voice tts.VoiceConfig
voice audio.VoiceConfig
file string
}{
{"en", "Hello, this is an Edge TTS audio save test.", tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
{"zh", "你好,这是一段中文语音合成测试。", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
{"en", "Hello, this is an Edge TTS audio save test.", audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
{"zh", "你好,这是一段中文语音合成测试。", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
}
outDir := filepath.Join(os.TempDir(), "edge_tts_test")
@@ -111,7 +111,7 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
config := tts.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
config := audio.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
audio, err := client.Synthesize(ctx, tc.text, config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -11,7 +11,7 @@ import (
"github.com/gorilla/websocket"
"github.com/memohai/memoh/internal/tts"
"github.com/memohai/memoh/internal/audio"
)
var upgrader = websocket.Upgrader{
@@ -95,7 +95,7 @@ func TestEdgeWsClient_ConnectAndSynthesize(t *testing.T) {
client := NewEdgeWsClient()
client.BaseURL = wsURL
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
audio, err := client.Synthesize(t.Context(), "Hello world", config)
if err != nil {
t.Fatalf("Synthesize: %v", err)
@@ -114,7 +114,7 @@ func TestEdgeWsClient_Stream(t *testing.T) {
client := NewEdgeWsClient()
client.BaseURL = wsURL
config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
ch, errCh := client.Stream(t.Context(), "Hi", config)
var chunks [][]byte
for b := range ch {
@@ -197,7 +197,7 @@ func TestParseAudioChunk_EmptyOrShort(t *testing.T) {
func TestBuildSSML(t *testing.T) {
t.Parallel()
ssml := buildSSML("Hello", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
ssml := buildSSML("Hello", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
if !strings.Contains(ssml, "zh-CN-XiaoxiaoNeural") {
t.Errorf("ssml should contain voice: %s", ssml)
}
@@ -1,11 +1,13 @@
package tts
package audio
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgtype"
"github.com/memohai/memoh/internal/db/sqlc"
@@ -14,23 +16,23 @@ import (
func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
for _, def := range registry.List() {
configJSON, err := json.Marshal(map[string]any{})
provider, err := queries.GetProviderByClientType(ctx, string(def.ClientType))
if err != nil {
return fmt.Errorf("marshal speech provider config: %w", err)
}
var icon pgtype.Text
if def.Icon != "" {
icon = pgtype.Text{String: def.Icon, Valid: true}
}
provider, err := queries.UpsertRegistryProvider(ctx, sqlc.UpsertRegistryProviderParams{
Name: def.DisplayName,
ClientType: string(def.ClientType),
Icon: icon,
Config: configJSON,
})
if err != nil {
return fmt.Errorf("upsert speech provider %s: %w", def.ClientType, err)
if errors.Is(err, pgx.ErrNoRows) {
if logger != nil {
logger.Warn("audio registry skipped provider without template",
slog.String("provider", string(def.ClientType)),
slog.String("display_name", def.DisplayName))
}
continue
}
if logger != nil {
logger.Warn("audio registry failed to load provider template",
slog.String("provider", string(def.ClientType)),
slog.String("display_name", def.DisplayName),
slog.Any("error", err))
}
return fmt.Errorf("get provider by client type %s: %w", def.ClientType, err)
}
synced := 0
@@ -1,4 +1,4 @@
package tts
package audio
// VoiceConfig is kept for backward compatibility with the legacy Edge adapter tests.
type VoiceConfig struct {
@@ -1,4 +1,4 @@
package tts
package audio
import (
"fmt"
@@ -1,4 +1,4 @@
package tts
package audio
import (
"context"
@@ -24,7 +24,7 @@ type Service struct {
func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
return &Service{
queries: queries,
logger: log.With(slog.String("service", "tts")),
logger: log.With(slog.String("service", "audio")),
registry: registry,
}
}
@@ -1,4 +1,4 @@
package tts
package audio
import (
"fmt"
@@ -13,7 +13,7 @@ import (
const (
defaultTTL = 10 * time.Minute
cleanupInterval = 1 * time.Minute
tempDirName = "tts_temp"
tempDirName = "audio_temp"
)
// TempStore manages temporary audio files on disk with automatic TTL-based cleanup.
@@ -30,7 +30,7 @@ type TempStore struct {
func NewTempStore(baseDir string) (*TempStore, error) {
dir := filepath.Join(baseDir, tempDirName)
if err := os.MkdirAll(dir, 0o750); err != nil {
return nil, fmt.Errorf("create tts temp dir: %w", err)
return nil, fmt.Errorf("create audio temp dir: %w", err)
}
return &TempStore{
dir: dir,
@@ -1,4 +1,4 @@
package tts
package audio
import "time"
+36 -36
View File
@@ -58,14 +58,14 @@ type mediaIngestor interface {
channel.ContainerAttachmentIngester
}
// ttsSynthesizer synthesizes text to speech audio.
type ttsSynthesizer interface {
// speechSynthesizer synthesizes text to speech audio.
type speechSynthesizer interface {
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
}
// ttsModelResolver looks up the TTS model ID configured for a bot.
type ttsModelResolver interface {
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
// speechModelResolver looks up the speech model ID configured for a bot.
type speechModelResolver interface {
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
}
// TranscriptionResult is the minimal speech-to-text response shape needed by inbound routing.
@@ -101,29 +101,29 @@ type SessionResult struct {
// ChannelInboundProcessor routes channel inbound messages to the chat gateway.
type ChannelInboundProcessor struct {
runner flow.Runner
routeResolver RouteResolver
message messagepkg.Writer
mediaService mediaIngestor
reactor channelReactor
commandHandler *command.Handler
registry *channel.Registry
logger *slog.Logger
jwtSecret string
tokenTTL time.Duration
identity *IdentityResolver
policy PolicyService
dispatcher *RouteDispatcher
acl chatACL
observer channel.StreamObserver
ttsService ttsSynthesizer
ttsModelResolver ttsModelResolver
transcriber transcriptionRecognizer
sttModelResolver transcriptionModelResolver
sessionEnsurer SessionEnsurer
pipeline *pipelinepkg.Pipeline
eventStore *pipelinepkg.EventStore
discussDriver *pipelinepkg.DiscussDriver
runner flow.Runner
routeResolver RouteResolver
message messagepkg.Writer
mediaService mediaIngestor
reactor channelReactor
commandHandler *command.Handler
registry *channel.Registry
logger *slog.Logger
jwtSecret string
tokenTTL time.Duration
identity *IdentityResolver
policy PolicyService
dispatcher *RouteDispatcher
acl chatACL
observer channel.StreamObserver
speechService speechSynthesizer
speechModelResolver speechModelResolver
transcriber transcriptionRecognizer
sttModelResolver transcriptionModelResolver
sessionEnsurer SessionEnsurer
pipeline *pipelinepkg.Pipeline
eventStore *pipelinepkg.EventStore
discussDriver *pipelinepkg.DiscussDriver
// activeStreams maps "botID:routeID" to a context.CancelFunc for the
// currently running agent stream. Used by /stop to abort generation
@@ -205,14 +205,14 @@ func (p *ChannelInboundProcessor) SetStreamObserver(observer channel.StreamObser
p.observer = observer
}
// SetTtsService configures the TTS synthesizer and settings reader for handling
// <speech> tag events (speech_delta) that require server-side audio synthesis.
func (p *ChannelInboundProcessor) SetTtsService(synth ttsSynthesizer, modelResolver ttsModelResolver) {
// SetSpeechService configures the speech synthesizer and settings reader for
// handling <speech> tag events (speech_delta) that require server-side audio synthesis.
func (p *ChannelInboundProcessor) SetSpeechService(synth speechSynthesizer, modelResolver speechModelResolver) {
if p == nil {
return
}
p.ttsService = synth
p.ttsModelResolver = modelResolver
p.speechService = synth
p.speechModelResolver = modelResolver
}
// SetTranscriptionService configures speech-to-text processing for inbound audio attachments.
@@ -2304,13 +2304,13 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
outboundAssetRefs *[]conversation.OutboundAssetRef,
assetMu *sync.Mutex,
) {
if p.ttsService == nil || p.ttsModelResolver == nil {
if p.speechService == nil || p.speechModelResolver == nil {
if p.logger != nil {
p.logger.Warn("speech_delta received but TTS service not configured")
}
return
}
modelID, err := p.ttsModelResolver.ResolveTtsModelID(ctx, botID)
modelID, err := p.speechModelResolver.ResolveSpeechModelID(ctx, botID)
if err != nil || strings.TrimSpace(modelID) == "" {
if p.logger != nil {
p.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
@@ -2322,7 +2322,7 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
if text == "" {
continue
}
audioData, contentType, synthErr := p.ttsService.Synthesize(ctx, modelID, text, nil)
audioData, contentType, synthErr := p.speechService.Synthesize(ctx, modelID, text, nil)
if synthErr != nil {
if p.logger != nil {
p.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
+21
View File
@@ -318,6 +318,27 @@ func (q *Queries) GetModelByProviderAndModelID(ctx context.Context, arg GetModel
return i, err
}
const getProviderByClientType = `-- name: GetProviderByClientType :one
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE client_type = $1
`
// GetProviderByClientType runs the getProviderByClientType query with
// clientType bound to its single placeholder ($1) and scans the resulting row
// into a Provider.
//
// The error produced by Scan is returned unchanged alongside the Provider
// value; the Provider should only be used when the error is nil.
//
// NOTE(review): the audio registry sync in this change tests the returned
// error with errors.Is(err, pgx.ErrNoRows) to detect a provider that has no
// row yet, so the error must stay unwrapped (or wrapped with %w) here.
// NOTE(review): the query has no ORDER BY or LIMIT; presumably client_type is
// unique in the providers table — confirm against the schema.
func (q *Queries) GetProviderByClientType(ctx context.Context, clientType string) (Provider, error) {
row := q.db.QueryRow(ctx, getProviderByClientType, clientType)
var i Provider
// Scan targets are listed in the same order as the columns in the SELECT
// of getProviderByClientType; keep the two in sync.
err := row.Scan(
&i.ID,
&i.Name,
&i.ClientType,
&i.Icon,
&i.Enable,
&i.Config,
&i.Metadata,
&i.CreatedAt,
&i.UpdatedAt,
)
return i, err
}
const getProviderByID = `-- name: GetProviderByID :one
SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE id = $1
`
+13 -13
View File
@@ -7,28 +7,28 @@ import (
"github.com/labstack/echo/v4"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/settings"
"github.com/memohai/memoh/internal/tts"
)
// BotTtsHandler handles per-bot TTS synthesis requests from the agent tool.
type BotTtsHandler struct {
ttsService *tts.Service
// BotAudioHandler handles per-bot speech synthesis requests from the agent tool.
type BotAudioHandler struct {
audioService *audiopkg.Service
settingsService *settings.Service
tempStore *tts.TempStore
tempStore *audiopkg.TempStore
logger *slog.Logger
}
func NewBotTtsHandler(log *slog.Logger, ttsService *tts.Service, settingsService *settings.Service, tempStore *tts.TempStore) *BotTtsHandler {
return &BotTtsHandler{
ttsService: ttsService,
func NewBotAudioHandler(log *slog.Logger, audioService *audiopkg.Service, settingsService *settings.Service, tempStore *audiopkg.TempStore) *BotAudioHandler {
return &BotAudioHandler{
audioService: audioService,
settingsService: settingsService,
tempStore: tempStore,
logger: log.With(slog.String("handler", "bot_tts")),
logger: log.With(slog.String("handler", "bot_audio")),
}
}
func (h *BotTtsHandler) Register(e *echo.Echo) {
func (h *BotAudioHandler) Register(e *echo.Echo) {
e.POST("/bots/:bot_id/tts/synthesize", h.Synthesize)
}
@@ -54,7 +54,7 @@ type synthesizeResponse struct {
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /bots/{bot_id}/tts/synthesize [post].
func (h *BotTtsHandler) Synthesize(c echo.Context) error {
func (h *BotAudioHandler) Synthesize(c echo.Context) error {
botID := strings.TrimSpace(c.Param("bot_id"))
if botID == "" {
return echo.NewHTTPError(http.StatusBadRequest, "bot_id is required")
@@ -88,10 +88,10 @@ func (h *BotTtsHandler) Synthesize(c echo.Context) error {
return echo.NewHTTPError(http.StatusInternalServerError, "failed to create temp file")
}
contentType, streamErr := h.ttsService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
contentType, streamErr := h.audioService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
closeErr := f.Close()
if streamErr != nil {
h.logger.Error("tts synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
h.logger.Error("speech synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
h.tempStore.Delete(tempID)
return echo.NewHTTPError(http.StatusInternalServerError, streamErr.Error())
}
+24 -24
View File
@@ -30,30 +30,30 @@ import (
messagepkg "github.com/memohai/memoh/internal/message"
)
// localTtsSynthesizer synthesizes text to speech audio.
type localTtsSynthesizer interface {
// localSpeechSynthesizer synthesizes text to speech audio.
type localSpeechSynthesizer interface {
Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
}
// localTtsModelResolver resolves TTS model IDs for bots.
type localTtsModelResolver interface {
ResolveTtsModelID(ctx context.Context, botID string) (string, error)
// localSpeechModelResolver resolves speech model IDs for bots.
type localSpeechModelResolver interface {
ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
}
// LocalChannelHandler handles local channel routes (WebUI / API) backed by bot history.
type LocalChannelHandler struct {
channelType channel.ChannelType
channelManager *channel.Manager
channelStore *channel.Store
chatService *conversation.Service
routeHub *local.RouteHub
botService *bots.Service
accountService *accounts.Service
resolver *flow.Resolver
mediaService *media.Service
ttsService localTtsSynthesizer
ttsModelResolver localTtsModelResolver
logger *slog.Logger
channelType channel.ChannelType
channelManager *channel.Manager
channelStore *channel.Store
chatService *conversation.Service
routeHub *local.RouteHub
botService *bots.Service
accountService *accounts.Service
resolver *flow.Resolver
mediaService *media.Service
speechService localSpeechSynthesizer
speechModelResolver localSpeechModelResolver
logger *slog.Logger
}
// NewLocalChannelHandler creates a local channel handler.
@@ -80,10 +80,10 @@ func (h *LocalChannelHandler) SetMediaService(svc *media.Service) {
h.mediaService = svc
}
// SetTtsService configures TTS synthesis for handling speech_delta events.
func (h *LocalChannelHandler) SetTtsService(synth localTtsSynthesizer, resolver localTtsModelResolver) {
h.ttsService = synth
h.ttsModelResolver = resolver
// SetSpeechService configures speech synthesis for handling speech_delta events.
func (h *LocalChannelHandler) SetSpeechService(synth localSpeechSynthesizer, resolver localSpeechModelResolver) {
h.speechService = synth
h.speechModelResolver = resolver
}
// Register registers the local channel routes.
@@ -719,12 +719,12 @@ func (h *LocalChannelHandler) ingestSingleAttachment(ctx context.Context, botID,
// wsSynthesizeSpeech handles speech_delta events by synthesizing audio and
// injecting attachment_delta events with the resulting voice attachments.
func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID string, original json.RawMessage) []json.RawMessage {
if h.ttsService == nil || h.ttsModelResolver == nil {
if h.speechService == nil || h.speechModelResolver == nil {
h.logger.Warn("speech_delta received but TTS service not configured")
return nil
}
modelID, err := h.ttsModelResolver.ResolveTtsModelID(ctx, botID)
modelID, err := h.speechModelResolver.ResolveSpeechModelID(ctx, botID)
if err != nil || strings.TrimSpace(modelID) == "" {
h.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
return nil
@@ -746,7 +746,7 @@ func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID stri
continue
}
audioData, contentType, synthErr := h.ttsService.Synthesize(ctx, modelID, text, nil)
audioData, contentType, synthErr := h.speechService.Synthesize(ctx, modelID, text, nil)
if synthErr != nil {
h.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
continue
+139 -43
View File
@@ -12,25 +12,25 @@ import (
"github.com/labstack/echo/v4"
audiopkg "github.com/memohai/memoh/internal/audio"
"github.com/memohai/memoh/internal/models"
"github.com/memohai/memoh/internal/tts"
)
type SpeechHandler struct {
service *tts.Service
type AudioHandler struct {
service *audiopkg.Service
modelsService *models.Service
logger *slog.Logger
}
func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
return &SpeechHandler{
func NewAudioHandler(log *slog.Logger, service *audiopkg.Service, modelsService *models.Service) *AudioHandler {
return &AudioHandler{
service: service,
modelsService: modelsService,
logger: log.With(slog.String("handler", "speech")),
logger: log.With(slog.String("handler", "audio")),
}
}
func (h *SpeechHandler) Register(e *echo.Echo) {
func (h *AudioHandler) Register(e *echo.Echo) {
pg := e.Group("/speech-providers")
pg.GET("", h.ListProviders)
pg.GET("/:id", h.GetProvider)
@@ -64,13 +64,19 @@ func (h *SpeechHandler) Register(e *echo.Echo) {
// @Summary List speech provider metadata
// @Description List available speech provider types with their models and capabilities
// @Tags speech-providers
// @Success 200 {array} tts.ProviderMetaResponse
// @Success 200 {array} audiopkg.ProviderMetaResponse
// @Router /speech-providers/meta [get].
func (h *SpeechHandler) ListSpeechMeta(c echo.Context) error {
func (h *AudioHandler) ListSpeechMeta(c echo.Context) error {
return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context()))
}
func (h *SpeechHandler) ListTranscriptionMeta(c echo.Context) error {
// ListTranscriptionMeta godoc
// @Summary List transcription provider metadata
// @Description List available transcription provider types with their models and capabilities
// @Tags transcription-providers
// @Success 200 {array} audiopkg.ProviderMetaResponse
// @Router /transcription-providers/meta [get].
func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
	// Resolve the request context first, then serialize whatever the audio
	// service reports for transcription providers.
	ctx := c.Request().Context()
	meta := h.service.ListTranscriptionMeta(ctx)
	return c.JSON(http.StatusOK, meta)
}
@@ -79,10 +85,10 @@ func (h *SpeechHandler) ListTranscriptionMeta(c echo.Context) error {
// @Description List providers that support speech (filtered view of unified providers table)
// @Tags speech-providers
// @Produce json
// @Success 200 {array} tts.SpeechProviderResponse
// @Success 200 {array} audiopkg.SpeechProviderResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers [get].
func (h *SpeechHandler) ListProviders(c echo.Context) error {
func (h *AudioHandler) ListProviders(c echo.Context) error {
items, err := h.service.ListSpeechProviders(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -90,7 +96,15 @@ func (h *SpeechHandler) ListProviders(c echo.Context) error {
return c.JSON(http.StatusOK, items)
}
func (h *SpeechHandler) ListTranscriptionProviders(c echo.Context) error {
// ListTranscriptionProviders godoc
// @Summary List transcription providers
// @Description List providers that support transcription (filtered view of unified providers table)
// @Tags transcription-providers
// @Produce json
// @Success 200 {array} audiopkg.SpeechProviderResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers [get].
func (h *AudioHandler) ListTranscriptionProviders(c echo.Context) error {
items, err := h.service.ListTranscriptionProviders(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -104,11 +118,12 @@ func (h *SpeechHandler) ListTranscriptionProviders(c echo.Context) error {
// @Tags speech-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} tts.SpeechProviderResponse
// @Success 200 {object} audiopkg.SpeechProviderResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Router /speech-providers/{id} [get].
func (h *SpeechHandler) GetProvider(c echo.Context) error {
// @Router /transcription-providers/{id} [get].
func (h *AudioHandler) GetProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -126,11 +141,11 @@ func (h *SpeechHandler) GetProvider(c echo.Context) error {
// @Tags speech-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {array} tts.SpeechModelResponse
// @Success 200 {array} audiopkg.SpeechModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers/{id}/models [get].
func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -149,12 +164,12 @@ func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
// @Accept json
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} tts.ImportModelsResponse
// @Success 200 {object} audiopkg.ImportModelsResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-providers/{id}/import-models [post].
func (h *SpeechHandler) ImportModels(c echo.Context) error {
func (h *AudioHandler) ImportModels(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -165,7 +180,7 @@ func (h *SpeechHandler) ImportModels(c echo.Context) error {
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", err))
}
resp := tts.ImportModelsResponse{
resp := audiopkg.ImportModelsResponse{
Models: make([]string, 0, len(remoteModels)),
}
@@ -197,7 +212,17 @@ func (h *SpeechHandler) ImportModels(c echo.Context) error {
return c.JSON(http.StatusOK, resp)
}
func (h *SpeechHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
// ListTranscriptionModelsByProvider godoc
// @Summary List transcription models by provider
// @Description List models of type 'transcription' for a specific transcription provider
// @Tags transcription-providers
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {array} audiopkg.TranscriptionModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers/{id}/models [get].
func (h *AudioHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -209,7 +234,19 @@ func (h *SpeechHandler) ListTranscriptionModelsByProvider(c echo.Context) error
return c.JSON(http.StatusOK, items)
}
func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
// ImportTranscriptionModels godoc
// @Summary Import transcription models from provider
// @Description Fetch models using the configured transcription provider and import them into the unified models table
// @Tags transcription-providers
// @Accept json
// @Produce json
// @Param id path string true "Provider ID (UUID)"
// @Success 200 {object} audiopkg.ImportModelsResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-providers/{id}/import-models [post].
func (h *AudioHandler) ImportTranscriptionModels(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -220,7 +257,7 @@ func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote transcription models: %v", err))
}
resp := tts.ImportModelsResponse{
resp := audiopkg.ImportModelsResponse{
Models: make([]string, 0, len(remoteModels)),
}
@@ -257,10 +294,10 @@ func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
// @Description List all models of type 'speech' (filtered view of unified models table)
// @Tags speech-models
// @Produce json
// @Success 200 {array} tts.SpeechModelResponse
// @Success 200 {array} audiopkg.SpeechModelResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models [get].
func (h *SpeechHandler) ListModels(c echo.Context) error {
func (h *AudioHandler) ListModels(c echo.Context) error {
items, err := h.service.ListSpeechModels(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -268,7 +305,15 @@ func (h *SpeechHandler) ListModels(c echo.Context) error {
return c.JSON(http.StatusOK, items)
}
func (h *SpeechHandler) ListTranscriptionModels(c echo.Context) error {
// ListTranscriptionModels godoc
// @Summary List all transcription models
// @Description List all models of type 'transcription' (filtered view of unified models table)
// @Tags transcription-models
// @Produce json
// @Success 200 {array} audiopkg.TranscriptionModelResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models [get].
func (h *AudioHandler) ListTranscriptionModels(c echo.Context) error {
items, err := h.service.ListTranscriptionModels(c.Request().Context())
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -281,10 +326,10 @@ func (h *SpeechHandler) ListTranscriptionModels(c echo.Context) error {
// @Tags speech-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} tts.SpeechModelResponse
// @Success 200 {object} audiopkg.SpeechModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Router /speech-models/{id} [get].
func (h *SpeechHandler) GetModel(c echo.Context) error {
func (h *AudioHandler) GetModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -296,12 +341,23 @@ func (h *SpeechHandler) GetModel(c echo.Context) error {
return c.JSON(http.StatusOK, resp)
}
func (h *SpeechHandler) UpdateModel(c echo.Context) error {
// UpdateModel godoc
// @Summary Update a speech model
// @Tags speech-models
// @Accept json
// @Produce json
// @Param id path string true "Model ID"
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
// @Success 200 {object} audiopkg.SpeechModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models/{id} [put].
func (h *AudioHandler) UpdateModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req tts.UpdateSpeechModelRequest
var req audiopkg.UpdateSpeechModelRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
@@ -312,7 +368,15 @@ func (h *SpeechHandler) UpdateModel(c echo.Context) error {
return c.JSON(http.StatusOK, resp)
}
func (h *SpeechHandler) GetTranscriptionModel(c echo.Context) error {
// GetTranscriptionModel godoc
// @Summary Get a transcription model
// @Tags transcription-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.TranscriptionModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Router /transcription-models/{id} [get].
func (h *AudioHandler) GetTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -324,12 +388,23 @@ func (h *SpeechHandler) GetTranscriptionModel(c echo.Context) error {
return c.JSON(http.StatusOK, resp)
}
func (h *SpeechHandler) UpdateTranscriptionModel(c echo.Context) error {
// UpdateTranscriptionModel godoc
// @Summary Update a transcription model
// @Tags transcription-models
// @Accept json
// @Produce json
// @Param id path string true "Model ID"
// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
// @Success 200 {object} audiopkg.TranscriptionModelResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models/{id} [put].
func (h *AudioHandler) UpdateTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req tts.UpdateSpeechModelRequest
var req audiopkg.UpdateSpeechModelRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
@@ -345,10 +420,10 @@ func (h *SpeechHandler) UpdateTranscriptionModel(c echo.Context) error {
// @Tags speech-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} tts.ModelCapabilities
// @Success 200 {object} audiopkg.ModelCapabilities
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Router /speech-models/{id}/capabilities [get].
func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -360,7 +435,15 @@ func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
return c.JSON(http.StatusOK, caps)
}
func (h *SpeechHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
// GetTranscriptionModelCapabilities godoc
// @Summary Get transcription model capabilities
// @Tags transcription-models
// @Produce json
// @Param id path string true "Model ID"
// @Success 200 {object} audiopkg.ModelCapabilities
// @Failure 400 {object} ErrorResponse
// @Failure 404 {object} ErrorResponse
// @Router /transcription-models/{id}/capabilities [get].
func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -379,17 +462,17 @@ func (h *SpeechHandler) GetTranscriptionModelCapabilities(c echo.Context) error
// @Accept json
// @Produce application/octet-stream
// @Param id path string true "Model ID"
// @Param request body tts.TestSynthesizeRequest true "Text to synthesize"
// @Param request body audiopkg.TestSynthesizeRequest true "Text to synthesize"
// @Success 200 {file} binary "Audio data"
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /speech-models/{id}/test [post].
func (h *SpeechHandler) TestModel(c echo.Context) error {
func (h *AudioHandler) TestModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
}
var req tts.TestSynthesizeRequest
var req audiopkg.TestSynthesizeRequest
if err := c.Bind(&req); err != nil {
return echo.NewHTTPError(http.StatusBadRequest, err.Error())
}
@@ -408,7 +491,20 @@ func (h *SpeechHandler) TestModel(c echo.Context) error {
return c.Blob(http.StatusOK, contentType, audio)
}
func (h *SpeechHandler) TestTranscriptionModel(c echo.Context) error {
// TestTranscriptionModel godoc
// @Summary Test transcription model recognition
// @Description Transcribe uploaded audio using a specific model's config and return structured text output
// @Tags transcription-models
// @Accept mpfd
// @Produce json
// @Param id path string true "Model ID"
// @Param file formData file true "Audio file"
// @Param config formData string false "Optional JSON config"
// @Success 200 {object} audiopkg.TestTranscriptionResponse
// @Failure 400 {object} ErrorResponse
// @Failure 500 {object} ErrorResponse
// @Router /transcription-models/{id}/test [post].
func (h *AudioHandler) TestTranscriptionModel(c echo.Context) error {
id := strings.TrimSpace(c.Param("id"))
if id == "" {
return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -441,16 +537,16 @@ func (h *SpeechHandler) TestTranscriptionModel(c echo.Context) error {
if err != nil {
return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
}
resp := tts.TestTranscriptionResponse{
resp := audiopkg.TestTranscriptionResponse{
Text: result.Text,
Language: result.Language,
DurationSeconds: result.DurationSeconds,
Metadata: result.ProviderMetadata,
}
if len(result.Words) > 0 {
resp.Words = make([]tts.TranscriptionWord, 0, len(result.Words))
resp.Words = make([]audiopkg.TranscriptionWord, 0, len(result.Words))
for _, word := range result.Words {
resp.Words = append(resp.Words, tts.TranscriptionWord{
resp.Words = append(resp.Words, audiopkg.TranscriptionWord{
Text: word.Text,
Start: word.Start,
End: word.End,