fix: separate audio domain and restore transcription templates

Move speech and transcription internals into the audio domain, restore template-driven transcription providers, and regenerate Swagger/SDK so the frontend can stop hand-calling /transcription-* APIs.
2026-04-27 07:16:19 +09:00 · 2026-04-21 23:33:36 +08:00
parent f845e936f8
commit 7376bc5adb
43 changed files with 3511 additions and 1116 deletions
@@ -72,8 +72,7 @@ func TestSpawnAndNotify(t *testing.T) {
 	task := mgr.Get(taskID)
 	if task == nil {
 		t.Fatal("task not found after completion")
-	}
-	if task.Status != TaskCompleted {
+	} else if task.Status != TaskCompleted {
 		t.Errorf("expected task status completed, got %s", task.Status)
 	}
 }
@@ -130,8 +129,7 @@ func TestKillTask(t *testing.T) {
 	task := mgr.Get(taskID)
 	if task == nil {
 		t.Fatal("task not found")
-	}
-	if task.Status != TaskKilled {
+	} else if task.Status != TaskKilled {
 		t.Errorf("expected status killed, got %s", task.Status)
 	}

@@ -16,9 +16,9 @@ import (

 	sdk "github.com/memohai/twilight-ai/sdk"

+	audiopkg "github.com/memohai/memoh/internal/audio"
 	"github.com/memohai/memoh/internal/media"
 	"github.com/memohai/memoh/internal/settings"
-	ttspkg "github.com/memohai/memoh/internal/tts"
 )

 const mediaDataPrefix = "/data/media/"
@@ -26,19 +26,19 @@ const mediaDataPrefix = "/data/media/"
 type TranscriptionProvider struct {
 	logger   *slog.Logger
 	settings *settings.Service
-	tts      *ttspkg.Service
+	audio    *audiopkg.Service
 	media    *media.Service
 	http     *http.Client
 }

-func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
+func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
 	if log == nil {
 		log = slog.Default()
 	}
 	return &TranscriptionProvider{
 		logger:   log.With(slog.String("tool", "transcribe_audio")),
 		settings: settingsSvc,
-		tts:      ttsSvc,
+		audio:    audioSvc,
 		media:    mediaSvc,
 		http: &http.Client{
 			Timeout: 30 * time.Second,
@@ -56,7 +56,7 @@ func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, t
 }

 func (p *TranscriptionProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
-	if session.IsSubagent || p.settings == nil || p.tts == nil || p.media == nil {
+	if session.IsSubagent || p.settings == nil || p.audio == nil || p.media == nil {
 		return nil, nil
 	}
 	botID := strings.TrimSpace(session.BotID)
@@ -120,7 +120,7 @@ func (p *TranscriptionProvider) execTranscribe(ctx context.Context, session Sess
 	if prompt := FirstStringArg(args, "prompt"); prompt != "" {
 		override["prompt"] = prompt
 	}
-	result, err := p.tts.Transcribe(ctx, modelID, audio, filename, contentType, override)
+	result, err := p.audio.Transcribe(ctx, modelID, audio, filename, contentType, override)
 	if err != nil {
 		return nil, err
 	}
@@ -10,9 +10,9 @@ import (

 	sdk "github.com/memohai/twilight-ai/sdk"

+	audiopkg "github.com/memohai/memoh/internal/audio"
 	"github.com/memohai/memoh/internal/channel"
 	"github.com/memohai/memoh/internal/settings"
-	ttspkg "github.com/memohai/memoh/internal/tts"
 )

 const ttsMaxTextLen = 500
@@ -30,26 +30,26 @@ type TTSChannelResolver interface {
 type TTSProvider struct {
 	logger   *slog.Logger
 	settings *settings.Service
-	tts      *ttspkg.Service
+	audio    *audiopkg.Service
 	sender   TTSSender
 	resolver TTSChannelResolver
 }

-func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
+func NewTTSProvider(log *slog.Logger, settingsSvc *settings.Service, audioSvc *audiopkg.Service, sender TTSSender, resolver TTSChannelResolver) *TTSProvider {
 	if log == nil {
 		log = slog.Default()
 	}
 	return &TTSProvider{
 		logger:   log.With(slog.String("tool", "tts")),
 		settings: settingsSvc,
-		tts:      ttsSvc,
+		audio:    audioSvc,
 		sender:   sender,
 		resolver: resolver,
 	}
 }

 func (p *TTSProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
-	if session.IsSubagent || p.settings == nil || p.tts == nil || p.sender == nil || p.resolver == nil {
+	if session.IsSubagent || p.settings == nil || p.audio == nil || p.sender == nil || p.resolver == nil {
 		return nil, nil
 	}
 	botID := strings.TrimSpace(session.BotID)
@@ -115,7 +115,7 @@ func (p *TTSProvider) execSpeak(ctx context.Context, session SessionContext, arg
 	if botSettings.TtsModelID == "" {
 		return nil, errors.New("bot has no TTS model configured")
 	}
-	audioData, contentType, synthErr := p.tts.Synthesize(ctx, botSettings.TtsModelID, text, nil)
+	audioData, contentType, synthErr := p.audio.Synthesize(ctx, botSettings.TtsModelID, text, nil)
 	if synthErr != nil {
 		return nil, fmt.Errorf("speech synthesis failed: %s", synthErr.Error())
 	}
@@ -1,4 +1,4 @@
-package tts
+package audio

 import "context"

@@ -6,10 +6,10 @@ import (
 	"log/slog"
 	"strings"

-	"github.com/memohai/memoh/internal/tts"
+	"github.com/memohai/memoh/internal/audio"
 )

-const TtsTypeEdge tts.TtsType = "edge"
+const TtsTypeEdge audio.TtsType = "edge"

 const edgeModelReadAloud = "edge-read-aloud"

@@ -33,12 +33,12 @@ func NewEdgeAdapterWithClient(log *slog.Logger, client *EdgeWsClient) *EdgeAdapt
 	}
 }

-func (*EdgeAdapter) Type() tts.TtsType {
+func (*EdgeAdapter) Type() audio.TtsType {
 	return TtsTypeEdge
 }

-func (*EdgeAdapter) Meta() tts.TtsMeta {
-	return tts.TtsMeta{
+func (*EdgeAdapter) Meta() audio.TtsMeta {
+	return audio.TtsMeta{
 		Provider:    "Microsoft Edge",
 		Description: "Microsoft Edge TTS",
 	}
@@ -54,32 +54,32 @@ var edgeFormats = []string{
 	"webm-24khz-16bit-mono-opus",
 }

-var edgeSpeedConstraint = &tts.ParamConstraint{
+var edgeSpeedConstraint = &audio.ParamConstraint{
 	Options: []float64{0.5, 1.0, 2.0, 3.0},
 	Default: 1.0,
 }

-var edgePitchConstraint = &tts.ParamConstraint{
+var edgePitchConstraint = &audio.ParamConstraint{
 	Min:     -100,
 	Max:     100,
 	Default: 0,
 }

-func (*EdgeAdapter) Models() []tts.ModelInfo {
-	var voices []tts.VoiceInfo
+func (*EdgeAdapter) Models() []audio.ModelInfo {
+	var voices []audio.VoiceInfo
 	for lang, ids := range EdgeTTSVoices {
 		for _, id := range ids {
 			name := strings.TrimPrefix(id, lang+"-")
 			name = strings.TrimSuffix(name, "Neural")
-			voices = append(voices, tts.VoiceInfo{ID: id, Lang: lang, Name: name})
+			voices = append(voices, audio.VoiceInfo{ID: id, Lang: lang, Name: name})
 		}
 	}
-	return []tts.ModelInfo{
+	return []audio.ModelInfo{
 		{
 			ID:          edgeModelReadAloud,
 			Name:        "Edge Read Aloud",
 			Description: "Built-in Edge Read Aloud speech model",
-			Capabilities: tts.ModelCapabilities{
+			Capabilities: audio.ModelCapabilities{
 				Voices:  voices,
 				Formats: edgeFormats,
 				Speed:   edgeSpeedConstraint,
@@ -100,14 +100,14 @@ func (*EdgeAdapter) ResolveModel(model string) (string, error) {
 	return edgeModelReadAloud, nil
 }

-func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config tts.AudioConfig) ([]byte, error) {
+func (a *EdgeAdapter) Synthesize(ctx context.Context, text string, _ string, config audio.AudioConfig) ([]byte, error) {
 	if err := config.Validate(); err != nil {
 		return nil, fmt.Errorf("edge tts: invalid config: %w", err)
 	}
 	return a.client.Synthesize(ctx, text, config)
 }

-func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config tts.AudioConfig) (chan []byte, chan error) {
+func (a *EdgeAdapter) Stream(ctx context.Context, text string, _ string, config audio.AudioConfig) (chan []byte, chan error) {
 	if err := config.Validate(); err != nil {
 		errCh := make(chan error, 1)
 		errCh <- fmt.Errorf("edge tts: invalid config: %w", err)
@@ -8,7 +8,7 @@ import (
 	"strings"
 	"testing"

-	"github.com/memohai/memoh/internal/tts"
+	"github.com/memohai/memoh/internal/audio"
 )

 func TestEdgeAdapter_TypeAndMeta(t *testing.T) {
@@ -37,7 +37,7 @@ func TestEdgeAdapter_Synthesize_WithMockServer(t *testing.T) {
 	adapter := NewEdgeAdapterWithClient(slog.Default(), client)

 	ctx := context.Background()
-	config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
+	config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
 	audio, err := adapter.Synthesize(ctx, "Hello", edgeModelReadAloud, config)
 	if err != nil {
 		t.Fatalf("Synthesize: %v", err)
@@ -61,7 +61,7 @@ func TestEdgeAdapter_Stream_WithMockServer(t *testing.T) {
 	adapter := NewEdgeAdapterWithClient(slog.Default(), client)

 	ctx := context.Background()
-	config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
+	config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
 	ch, errCh := adapter.Stream(ctx, "Hi", edgeModelReadAloud, config)
 	var chunks [][]byte
 	for b := range ch {
@@ -86,7 +86,7 @@ func TestEdgeAdapter_Synthesize_NotConnected(t *testing.T) {
 	adapter := NewEdgeAdapterWithClient(slog.Default(), client)

 	ctx := context.Background()
-	_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, tts.AudioConfig{})
+	_, err := adapter.Synthesize(ctx, "x", edgeModelReadAloud, audio.AudioConfig{})
 	if err == nil {
 		t.Fatal("expected error when connection fails")
 	}
@@ -20,7 +20,7 @@ import (
 	"github.com/google/uuid"
 	"github.com/gorilla/websocket"

-	"github.com/memohai/memoh/internal/tts"
+	"github.com/memohai/memoh/internal/audio"
 )

 // Edge TTS WebSocket client.
@@ -184,7 +184,7 @@ func (c *EdgeWsClient) sendFrame(path, contentType, body string, extraHeaders ma
 }

 // Configure sends the speech.config message (output format, etc.).
-func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) error {
+func (c *EdgeWsClient) Configure(ctx context.Context, config audio.AudioConfig) error {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 	if c.conn == nil {
@@ -207,7 +207,7 @@ func (c *EdgeWsClient) Configure(ctx context.Context, config tts.AudioConfig) er
 }

 // buildSSML builds SSML with rate and pitch for Edge TTS prosody.
-func buildSSML(text string, voice tts.VoiceConfig, speed, pitch float64) string {
+func buildSSML(text string, voice audio.VoiceConfig, speed, pitch float64) string {
 	voiceID := voice.ID
 	if voiceID == "" {
 		voiceID = DEFAULT_VOICE
@@ -241,7 +241,7 @@ func escapeSSML(s string) string {

 // Synthesize sends SSML and synchronously collects all audio data.
 // It handles the full lifecycle: connect → configure → send → receive → close.
-func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config tts.AudioConfig) ([]byte, error) {
+func (c *EdgeWsClient) Synthesize(ctx context.Context, text string, config audio.AudioConfig) ([]byte, error) {
 	if err := c.Connect(ctx); err != nil {
 		return nil, err
 	}
@@ -338,7 +338,7 @@ func parseAudioChunk(data []byte) ([]byte, error) {

 // Stream sends SSML and returns audio chunks via channel.
 // It handles the full lifecycle: connect → configure → send → stream → close.
-func (c *EdgeWsClient) Stream(ctx context.Context, text string, config tts.AudioConfig) (ch chan []byte, errCh chan error) {
+func (c *EdgeWsClient) Stream(ctx context.Context, text string, config audio.AudioConfig) (ch chan []byte, errCh chan error) {
 	ch = make(chan []byte, 8)
 	errCh = make(chan error, 1)
 	go func() {
@@ -9,7 +9,7 @@ import (
 	"testing"
 	"time"

-	"github.com/memohai/memoh/internal/tts"
+	"github.com/memohai/memoh/internal/audio"
 )

 // Real Edge TTS integration tests. Not compiled by default (requires -tags=integration).
@@ -17,14 +17,14 @@ import (
 //
 // Run:
 //
-//	go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS -v
+//	go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS -v

 func TestRealEdgeTTS_Synthesize(t *testing.T) {
 	client := NewEdgeWsClient()
 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()

-	config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
+	config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
 	audio, err := client.Synthesize(ctx, "Hello, this is a real Edge TTS test.", config)
 	if err != nil {
 		t.Fatalf("Synthesize: %v", err)
@@ -40,7 +40,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()

-	config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
+	config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}}
 	ch, errCh := client.Stream(ctx, "你好，这是流式测试。", config)
 	var total int
 	for b := range ch {
@@ -57,7 +57,7 @@ func TestRealEdgeTTS_Stream(t *testing.T) {

 // TestRealEdgeTTS_Formats tries every candidate format and reports which ones are supported.
 //
-//	go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_Formats -v
+//	go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_Formats -v
 func TestRealEdgeTTS_Formats(t *testing.T) {
 	formats := []string{
 		"audio-24khz-48kbitrate-mono-mp3",
@@ -71,8 +71,8 @@ func TestRealEdgeTTS_Formats(t *testing.T) {
 			ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 			defer cancel()

-			config := tts.AudioConfig{
-				Voice:  tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
+			config := audio.AudioConfig{
+				Voice:  audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"},
 				Format: fmt,
 				Speed:  1.0,
 			}
@@ -88,7 +88,7 @@ func TestRealEdgeTTS_Formats(t *testing.T) {

 // TestRealEdgeTTS_SaveAudio synthesizes speech and writes the result to a file for manual inspection.
 //
-//	go test -tags=integration ./internal/tts/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
+//	go test -tags=integration ./internal/audio/adapter/edge/... -run TestRealEdgeTTS_SaveAudio -v
 func TestRealEdgeTTS_SaveAudio(t *testing.T) {
 	client := NewEdgeWsClient()
 	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -97,11 +97,11 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {
 	cases := []struct {
 		name  string
 		text  string
-		voice tts.VoiceConfig
+		voice audio.VoiceConfig
 		file  string
 	}{
-		{"en", "Hello, this is an Edge TTS audio save test.", tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
-		{"zh", "你好，这是一段中文语音合成测试。", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
+		{"en", "Hello, this is an Edge TTS audio save test.", audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, "test_en.mp3"},
+		{"zh", "你好，这是一段中文语音合成测试。", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, "test_zh.mp3"},
 	}

 	outDir := filepath.Join(os.TempDir(), "edge_tts_test")
@@ -111,7 +111,7 @@ func TestRealEdgeTTS_SaveAudio(t *testing.T) {

 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
-			config := tts.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
+			config := audio.AudioConfig{Voice: tc.voice, Speed: 1.0, Pitch: -10.0}
 			audio, err := client.Synthesize(ctx, tc.text, config)
 			if err != nil {
 				t.Fatalf("Synthesize: %v", err)
@@ -11,7 +11,7 @@ import (

 	"github.com/gorilla/websocket"

-	"github.com/memohai/memoh/internal/tts"
+	"github.com/memohai/memoh/internal/audio"
 )

 var upgrader = websocket.Upgrader{
@@ -95,7 +95,7 @@ func TestEdgeWsClient_ConnectAndSynthesize(t *testing.T) {
 	client := NewEdgeWsClient()
 	client.BaseURL = wsURL

-	config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
+	config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}, Speed: 1.0}
 	audio, err := client.Synthesize(t.Context(), "Hello world", config)
 	if err != nil {
 		t.Fatalf("Synthesize: %v", err)
@@ -114,7 +114,7 @@ func TestEdgeWsClient_Stream(t *testing.T) {
 	client := NewEdgeWsClient()
 	client.BaseURL = wsURL

-	config := tts.AudioConfig{Voice: tts.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
+	config := audio.AudioConfig{Voice: audio.VoiceConfig{ID: "en-US-JennyNeural", Lang: "en-US"}}
 	ch, errCh := client.Stream(t.Context(), "Hi", config)
 	var chunks [][]byte
 	for b := range ch {
@@ -197,7 +197,7 @@ func TestParseAudioChunk_EmptyOrShort(t *testing.T) {

 func TestBuildSSML(t *testing.T) {
 	t.Parallel()
-	ssml := buildSSML("Hello", tts.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
+	ssml := buildSSML("Hello", audio.VoiceConfig{ID: "zh-CN-XiaoxiaoNeural", Lang: "zh-CN"}, 1.0, 0)
 	if !strings.Contains(ssml, "zh-CN-XiaoxiaoNeural") {
 		t.Errorf("ssml should contain voice: %s", ssml)
 	}
@@ -1,11 +1,13 @@
-package tts
+package audio

 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"log/slog"

+	"github.com/jackc/pgx/v5"
 	"github.com/jackc/pgx/v5/pgtype"

 	"github.com/memohai/memoh/internal/db/sqlc"
@@ -14,23 +16,23 @@ import (

 func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
 	for _, def := range registry.List() {
-		configJSON, err := json.Marshal(map[string]any{})
+		provider, err := queries.GetProviderByClientType(ctx, string(def.ClientType))
 		if err != nil {
-			return fmt.Errorf("marshal speech provider config: %w", err)
-		}
-		var icon pgtype.Text
-		if def.Icon != "" {
-			icon = pgtype.Text{String: def.Icon, Valid: true}
-		}
-
-		provider, err := queries.UpsertRegistryProvider(ctx, sqlc.UpsertRegistryProviderParams{
-			Name:       def.DisplayName,
-			ClientType: string(def.ClientType),
-			Icon:       icon,
-			Config:     configJSON,
-		})
-		if err != nil {
-			return fmt.Errorf("upsert speech provider %s: %w", def.ClientType, err)
+			if errors.Is(err, pgx.ErrNoRows) {
+				if logger != nil {
+					logger.Warn("audio registry skipped provider without template",
+						slog.String("provider", string(def.ClientType)),
+						slog.String("display_name", def.DisplayName))
+				}
+				continue
+			}
+			if logger != nil {
+				logger.Warn("audio registry failed to load provider template",
+					slog.String("provider", string(def.ClientType)),
+					slog.String("display_name", def.DisplayName),
+					slog.Any("error", err))
+			}
+			return fmt.Errorf("get provider by client type %s: %w", def.ClientType, err)
 		}

 		synced := 0
@@ -1,4 +1,4 @@
-package tts
+package audio

 // VoiceConfig is kept for backward compatibility with the legacy Edge adapter tests.
 type VoiceConfig struct {
@@ -1,4 +1,4 @@
-package tts
+package audio

 import (
 	"fmt"
@@ -1,4 +1,4 @@
-package tts
+package audio

 import (
 	"context"
@@ -24,7 +24,7 @@ type Service struct {
 func NewService(log *slog.Logger, queries *sqlc.Queries, registry *Registry) *Service {
 	return &Service{
 		queries:  queries,
-		logger:   log.With(slog.String("service", "tts")),
+		logger:   log.With(slog.String("service", "audio")),
 		registry: registry,
 	}
 }
@@ -1,4 +1,4 @@
-package tts
+package audio

 import (
 	"fmt"
@@ -13,7 +13,7 @@ import (
 const (
 	defaultTTL      = 10 * time.Minute
 	cleanupInterval = 1 * time.Minute
-	tempDirName     = "tts_temp"
+	tempDirName     = "audio_temp"
 )

 // TempStore manages temporary audio files on disk with automatic TTL-based cleanup.
@@ -30,7 +30,7 @@ type TempStore struct {
 func NewTempStore(baseDir string) (*TempStore, error) {
 	dir := filepath.Join(baseDir, tempDirName)
 	if err := os.MkdirAll(dir, 0o750); err != nil {
-		return nil, fmt.Errorf("create tts temp dir: %w", err)
+		return nil, fmt.Errorf("create audio temp dir: %w", err)
 	}
 	return &TempStore{
 		dir:     dir,
@@ -1,4 +1,4 @@
-package tts
+package audio

 import "time"

@@ -58,14 +58,14 @@ type mediaIngestor interface {
 	channel.ContainerAttachmentIngester
 }

-// ttsSynthesizer synthesizes text to speech audio.
-type ttsSynthesizer interface {
+// speechSynthesizer synthesizes text to speech audio.
+type speechSynthesizer interface {
 	Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
 }

-// ttsModelResolver looks up the TTS model ID configured for a bot.
-type ttsModelResolver interface {
-	ResolveTtsModelID(ctx context.Context, botID string) (string, error)
+// speechModelResolver looks up the speech model ID configured for a bot.
+type speechModelResolver interface {
+	ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
 }

 // TranscriptionResult is the minimal speech-to-text response shape needed by inbound routing.
@@ -101,29 +101,29 @@ type SessionResult struct {

 // ChannelInboundProcessor routes channel inbound messages to the chat gateway.
 type ChannelInboundProcessor struct {
-	runner           flow.Runner
-	routeResolver    RouteResolver
-	message          messagepkg.Writer
-	mediaService     mediaIngestor
-	reactor          channelReactor
-	commandHandler   *command.Handler
-	registry         *channel.Registry
-	logger           *slog.Logger
-	jwtSecret        string
-	tokenTTL         time.Duration
-	identity         *IdentityResolver
-	policy           PolicyService
-	dispatcher       *RouteDispatcher
-	acl              chatACL
-	observer         channel.StreamObserver
-	ttsService       ttsSynthesizer
-	ttsModelResolver ttsModelResolver
-	transcriber      transcriptionRecognizer
-	sttModelResolver transcriptionModelResolver
-	sessionEnsurer   SessionEnsurer
-	pipeline         *pipelinepkg.Pipeline
-	eventStore       *pipelinepkg.EventStore
-	discussDriver    *pipelinepkg.DiscussDriver
+	runner              flow.Runner
+	routeResolver       RouteResolver
+	message             messagepkg.Writer
+	mediaService        mediaIngestor
+	reactor             channelReactor
+	commandHandler      *command.Handler
+	registry            *channel.Registry
+	logger              *slog.Logger
+	jwtSecret           string
+	tokenTTL            time.Duration
+	identity            *IdentityResolver
+	policy              PolicyService
+	dispatcher          *RouteDispatcher
+	acl                 chatACL
+	observer            channel.StreamObserver
+	speechService       speechSynthesizer
+	speechModelResolver speechModelResolver
+	transcriber         transcriptionRecognizer
+	sttModelResolver    transcriptionModelResolver
+	sessionEnsurer      SessionEnsurer
+	pipeline            *pipelinepkg.Pipeline
+	eventStore          *pipelinepkg.EventStore
+	discussDriver       *pipelinepkg.DiscussDriver

 	// activeStreams maps "botID:routeID" to a context.CancelFunc for the
 	// currently running agent stream. Used by /stop to abort generation
@@ -205,14 +205,14 @@ func (p *ChannelInboundProcessor) SetStreamObserver(observer channel.StreamObser
 	p.observer = observer
 }

-// SetTtsService configures the TTS synthesizer and settings reader for handling
-// <speech> tag events (speech_delta) that require server-side audio synthesis.
-func (p *ChannelInboundProcessor) SetTtsService(synth ttsSynthesizer, modelResolver ttsModelResolver) {
+// SetSpeechService configures the speech synthesizer and the per-bot speech model resolver
+// used to handle <speech> tag events (speech_delta) that require server-side audio synthesis.
+func (p *ChannelInboundProcessor) SetSpeechService(synth speechSynthesizer, modelResolver speechModelResolver) {
 	if p == nil {
 		return
 	}
-	p.ttsService = synth
-	p.ttsModelResolver = modelResolver
+	p.speechService = synth
+	p.speechModelResolver = modelResolver
 }

 // SetTranscriptionService configures speech-to-text processing for inbound audio attachments.
@@ -2304,13 +2304,13 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
 	outboundAssetRefs *[]conversation.OutboundAssetRef,
 	assetMu *sync.Mutex,
 ) {
-	if p.ttsService == nil || p.ttsModelResolver == nil {
+	if p.speechService == nil || p.speechModelResolver == nil {
 		if p.logger != nil {
 			p.logger.Warn("speech_delta received but TTS service not configured")
 		}
 		return
 	}
-	modelID, err := p.ttsModelResolver.ResolveTtsModelID(ctx, botID)
+	modelID, err := p.speechModelResolver.ResolveSpeechModelID(ctx, botID)
 	if err != nil || strings.TrimSpace(modelID) == "" {
 		if p.logger != nil {
 			p.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
@@ -2322,7 +2322,7 @@ func (p *ChannelInboundProcessor) synthesizeAndPushVoice(
 		if text == "" {
 			continue
 		}
-		audioData, contentType, synthErr := p.ttsService.Synthesize(ctx, modelID, text, nil)
+		audioData, contentType, synthErr := p.speechService.Synthesize(ctx, modelID, text, nil)
 		if synthErr != nil {
 			if p.logger != nil {
 				p.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
@@ -318,6 +318,27 @@ func (q *Queries) GetModelByProviderAndModelID(ctx context.Context, arg GetModel
 	return i, err
 }

+const getProviderByClientType = `-- name: GetProviderByClientType :one
+SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE client_type = $1
+`
+
+func (q *Queries) GetProviderByClientType(ctx context.Context, clientType string) (Provider, error) {
+	row := q.db.QueryRow(ctx, getProviderByClientType, clientType)
+	var i Provider
+	err := row.Scan(
+		&i.ID,
+		&i.Name,
+		&i.ClientType,
+		&i.Icon,
+		&i.Enable,
+		&i.Config,
+		&i.Metadata,
+		&i.CreatedAt,
+		&i.UpdatedAt,
+	)
+	return i, err
+}
+
 const getProviderByID = `-- name: GetProviderByID :one
 SELECT id, name, client_type, icon, enable, config, metadata, created_at, updated_at FROM providers WHERE id = $1
 `
@@ -7,28 +7,28 @@ import (

 	"github.com/labstack/echo/v4"

+	audiopkg "github.com/memohai/memoh/internal/audio"
 	"github.com/memohai/memoh/internal/settings"
-	"github.com/memohai/memoh/internal/tts"
 )

-// BotTtsHandler handles per-bot TTS synthesis requests from the agent tool.
-type BotTtsHandler struct {
-	ttsService      *tts.Service
+// BotAudioHandler handles per-bot speech synthesis requests from the agent tool.
+type BotAudioHandler struct {
+	audioService    *audiopkg.Service
 	settingsService *settings.Service
-	tempStore       *tts.TempStore
+	tempStore       *audiopkg.TempStore
 	logger          *slog.Logger
 }

-func NewBotTtsHandler(log *slog.Logger, ttsService *tts.Service, settingsService *settings.Service, tempStore *tts.TempStore) *BotTtsHandler {
-	return &BotTtsHandler{
-		ttsService:      ttsService,
+func NewBotAudioHandler(log *slog.Logger, audioService *audiopkg.Service, settingsService *settings.Service, tempStore *audiopkg.TempStore) *BotAudioHandler {
+	return &BotAudioHandler{
+		audioService:    audioService,
 		settingsService: settingsService,
 		tempStore:       tempStore,
-		logger:          log.With(slog.String("handler", "bot_tts")),
+		logger:          log.With(slog.String("handler", "bot_audio")),
 	}
 }

-func (h *BotTtsHandler) Register(e *echo.Echo) {
+func (h *BotAudioHandler) Register(e *echo.Echo) {
 	e.POST("/bots/:bot_id/tts/synthesize", h.Synthesize)
 }

@@ -54,7 +54,7 @@ type synthesizeResponse struct {
 // @Failure 400 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /bots/{bot_id}/tts/synthesize [post].
-func (h *BotTtsHandler) Synthesize(c echo.Context) error {
+func (h *BotAudioHandler) Synthesize(c echo.Context) error {
 	botID := strings.TrimSpace(c.Param("bot_id"))
 	if botID == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "bot_id is required")
@@ -88,10 +88,10 @@ func (h *BotTtsHandler) Synthesize(c echo.Context) error {
 		return echo.NewHTTPError(http.StatusInternalServerError, "failed to create temp file")
 	}

-	contentType, streamErr := h.ttsService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
+	contentType, streamErr := h.audioService.StreamToFile(c.Request().Context(), botSettings.TtsModelID, text, f)
 	closeErr := f.Close()
 	if streamErr != nil {
-		h.logger.Error("tts synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
+		h.logger.Error("speech synthesis failed", slog.String("bot_id", botID), slog.String("model_id", botSettings.TtsModelID), slog.Any("error", streamErr))
 		h.tempStore.Delete(tempID)
 		return echo.NewHTTPError(http.StatusInternalServerError, streamErr.Error())
 	}
@@ -30,30 +30,30 @@ import (
 	messagepkg "github.com/memohai/memoh/internal/message"
 )

-// localTtsSynthesizer synthesizes text to speech audio.
-type localTtsSynthesizer interface {
+// localSpeechSynthesizer synthesizes text to speech audio.
+type localSpeechSynthesizer interface {
 	Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error)
 }

-// localTtsModelResolver resolves TTS model IDs for bots.
-type localTtsModelResolver interface {
-	ResolveTtsModelID(ctx context.Context, botID string) (string, error)
+// localSpeechModelResolver resolves speech model IDs for bots.
+type localSpeechModelResolver interface {
+	ResolveSpeechModelID(ctx context.Context, botID string) (string, error)
 }

 // LocalChannelHandler handles local channel routes (WebUI / API) backed by bot history.
 type LocalChannelHandler struct {
-	channelType      channel.ChannelType
-	channelManager   *channel.Manager
-	channelStore     *channel.Store
-	chatService      *conversation.Service
-	routeHub         *local.RouteHub
-	botService       *bots.Service
-	accountService   *accounts.Service
-	resolver         *flow.Resolver
-	mediaService     *media.Service
-	ttsService       localTtsSynthesizer
-	ttsModelResolver localTtsModelResolver
-	logger           *slog.Logger
+	channelType         channel.ChannelType
+	channelManager      *channel.Manager
+	channelStore        *channel.Store
+	chatService         *conversation.Service
+	routeHub            *local.RouteHub
+	botService          *bots.Service
+	accountService      *accounts.Service
+	resolver            *flow.Resolver
+	mediaService        *media.Service
+	speechService       localSpeechSynthesizer
+	speechModelResolver localSpeechModelResolver
+	logger              *slog.Logger
 }

 // NewLocalChannelHandler creates a local channel handler.
@@ -80,10 +80,10 @@ func (h *LocalChannelHandler) SetMediaService(svc *media.Service) {
 	h.mediaService = svc
 }

-// SetTtsService configures TTS synthesis for handling speech_delta events.
-func (h *LocalChannelHandler) SetTtsService(synth localTtsSynthesizer, resolver localTtsModelResolver) {
-	h.ttsService = synth
-	h.ttsModelResolver = resolver
+// SetSpeechService configures speech synthesis for handling speech_delta events.
+func (h *LocalChannelHandler) SetSpeechService(synth localSpeechSynthesizer, resolver localSpeechModelResolver) {
+	h.speechService = synth
+	h.speechModelResolver = resolver
 }

 // Register registers the local channel routes.
@@ -719,12 +719,12 @@ func (h *LocalChannelHandler) ingestSingleAttachment(ctx context.Context, botID,
 // wsSynthesizeSpeech handles speech_delta events by synthesizing audio and
 // injecting attachment_delta events with the resulting voice attachments.
 func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID string, original json.RawMessage) []json.RawMessage {
-	if h.ttsService == nil || h.ttsModelResolver == nil {
+	if h.speechService == nil || h.speechModelResolver == nil {
 		h.logger.Warn("speech_delta received but TTS service not configured")
 		return nil
 	}

-	modelID, err := h.ttsModelResolver.ResolveTtsModelID(ctx, botID)
+	modelID, err := h.speechModelResolver.ResolveSpeechModelID(ctx, botID)
 	if err != nil || strings.TrimSpace(modelID) == "" {
 		h.logger.Warn("speech_delta: bot has no TTS model configured", slog.String("bot_id", botID))
 		return nil
@@ -746,7 +746,7 @@ func (h *LocalChannelHandler) wsSynthesizeSpeech(ctx context.Context, botID stri
 			continue
 		}

-		audioData, contentType, synthErr := h.ttsService.Synthesize(ctx, modelID, text, nil)
+		audioData, contentType, synthErr := h.speechService.Synthesize(ctx, modelID, text, nil)
 		if synthErr != nil {
 			h.logger.Warn("speech synthesis failed", slog.String("bot_id", botID), slog.Any("error", synthErr))
 			continue
@@ -12,25 +12,25 @@ import (

 	"github.com/labstack/echo/v4"

+	audiopkg "github.com/memohai/memoh/internal/audio"
 	"github.com/memohai/memoh/internal/models"
-	"github.com/memohai/memoh/internal/tts"
 )

-type SpeechHandler struct {
-	service       *tts.Service
+type AudioHandler struct {
+	service       *audiopkg.Service
 	modelsService *models.Service
 	logger        *slog.Logger
 }

-func NewSpeechHandler(log *slog.Logger, service *tts.Service, modelsService *models.Service) *SpeechHandler {
-	return &SpeechHandler{
+func NewAudioHandler(log *slog.Logger, service *audiopkg.Service, modelsService *models.Service) *AudioHandler {
+	return &AudioHandler{
 		service:       service,
 		modelsService: modelsService,
-		logger:        log.With(slog.String("handler", "speech")),
+		logger:        log.With(slog.String("handler", "audio")),
 	}
 }

-func (h *SpeechHandler) Register(e *echo.Echo) {
+func (h *AudioHandler) Register(e *echo.Echo) {
 	pg := e.Group("/speech-providers")
 	pg.GET("", h.ListProviders)
 	pg.GET("/:id", h.GetProvider)
@@ -64,13 +64,19 @@ func (h *SpeechHandler) Register(e *echo.Echo) {
 // @Summary List speech provider metadata
 // @Description List available speech provider types with their models and capabilities
 // @Tags speech-providers
-// @Success 200 {array} tts.ProviderMetaResponse
+// @Success 200 {array} audiopkg.ProviderMetaResponse
 // @Router /speech-providers/meta [get].
-func (h *SpeechHandler) ListSpeechMeta(c echo.Context) error {
+func (h *AudioHandler) ListSpeechMeta(c echo.Context) error {
 	return c.JSON(http.StatusOK, h.service.ListSpeechMeta(c.Request().Context()))
 }

-func (h *SpeechHandler) ListTranscriptionMeta(c echo.Context) error {
+// ListTranscriptionMeta godoc
+// @Summary List transcription provider metadata
+// @Description List available transcription provider types with their models and capabilities
+// @Tags transcription-providers
+// @Success 200 {array} audiopkg.ProviderMetaResponse
+// @Router /transcription-providers/meta [get].
+func (h *AudioHandler) ListTranscriptionMeta(c echo.Context) error {
 	return c.JSON(http.StatusOK, h.service.ListTranscriptionMeta(c.Request().Context()))
 }

@@ -79,10 +85,10 @@ func (h *SpeechHandler) ListTranscriptionMeta(c echo.Context) error {
 // @Description List providers that support speech (filtered view of unified providers table)
 // @Tags speech-providers
 // @Produce json
-// @Success 200 {array} tts.SpeechProviderResponse
+// @Success 200 {array} audiopkg.SpeechProviderResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-providers [get].
-func (h *SpeechHandler) ListProviders(c echo.Context) error {
+func (h *AudioHandler) ListProviders(c echo.Context) error {
 	items, err := h.service.ListSpeechProviders(c.Request().Context())
 	if err != nil {
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -90,7 +96,15 @@ func (h *SpeechHandler) ListProviders(c echo.Context) error {
 	return c.JSON(http.StatusOK, items)
 }

-func (h *SpeechHandler) ListTranscriptionProviders(c echo.Context) error {
+// ListTranscriptionProviders godoc
+// @Summary List transcription providers
+// @Description List providers that support transcription (filtered view of unified providers table)
+// @Tags transcription-providers
+// @Produce json
+// @Success 200 {array} audiopkg.SpeechProviderResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /transcription-providers [get].
+func (h *AudioHandler) ListTranscriptionProviders(c echo.Context) error {
 	items, err := h.service.ListTranscriptionProviders(c.Request().Context())
 	if err != nil {
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -104,11 +118,12 @@ func (h *SpeechHandler) ListTranscriptionProviders(c echo.Context) error {
 // @Tags speech-providers
 // @Produce json
 // @Param id path string true "Provider ID (UUID)"
-// @Success 200 {object} tts.SpeechProviderResponse
+// @Success 200 {object} audiopkg.SpeechProviderResponse
 // @Failure 400 {object} ErrorResponse
 // @Failure 404 {object} ErrorResponse
 // @Router /speech-providers/{id} [get].
-func (h *SpeechHandler) GetProvider(c echo.Context) error {
+// @Router /transcription-providers/{id} [get].
+func (h *AudioHandler) GetProvider(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -126,11 +141,11 @@ func (h *SpeechHandler) GetProvider(c echo.Context) error {
 // @Tags speech-providers
 // @Produce json
 // @Param id path string true "Provider ID (UUID)"
-// @Success 200 {array} tts.SpeechModelResponse
+// @Success 200 {array} audiopkg.SpeechModelResponse
 // @Failure 400 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-providers/{id}/models [get].
-func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
+func (h *AudioHandler) ListModelsByProvider(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -149,12 +164,12 @@ func (h *SpeechHandler) ListModelsByProvider(c echo.Context) error {
 // @Accept json
 // @Produce json
 // @Param id path string true "Provider ID (UUID)"
-// @Success 200 {object} tts.ImportModelsResponse
+// @Success 200 {object} audiopkg.ImportModelsResponse
 // @Failure 400 {object} ErrorResponse
 // @Failure 404 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-providers/{id}/import-models [post].
-func (h *SpeechHandler) ImportModels(c echo.Context) error {
+func (h *AudioHandler) ImportModels(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -165,7 +180,7 @@ func (h *SpeechHandler) ImportModels(c echo.Context) error {
 		return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote speech models: %v", err))
 	}

-	resp := tts.ImportModelsResponse{
+	resp := audiopkg.ImportModelsResponse{
 		Models: make([]string, 0, len(remoteModels)),
 	}

@@ -197,7 +212,17 @@ func (h *SpeechHandler) ImportModels(c echo.Context) error {
 	return c.JSON(http.StatusOK, resp)
 }

-func (h *SpeechHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
+// ListTranscriptionModelsByProvider godoc
+// @Summary List transcription models by provider
+// @Description List models of type 'transcription' for a specific transcription provider
+// @Tags transcription-providers
+// @Produce json
+// @Param id path string true "Provider ID (UUID)"
+// @Success 200 {array} audiopkg.TranscriptionModelResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /transcription-providers/{id}/models [get].
+func (h *AudioHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -209,7 +234,19 @@ func (h *SpeechHandler) ListTranscriptionModelsByProvider(c echo.Context) error
 	return c.JSON(http.StatusOK, items)
 }

-func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
+// ImportTranscriptionModels godoc
+// @Summary Import transcription models from provider
+// @Description Fetch models using the configured transcription provider and import them into the unified models table
+// @Tags transcription-providers
+// @Accept json
+// @Produce json
+// @Param id path string true "Provider ID (UUID)"
+// @Success 200 {object} audiopkg.ImportModelsResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 404 {object} ErrorResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /transcription-providers/{id}/import-models [post].
+func (h *AudioHandler) ImportTranscriptionModels(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -220,7 +257,7 @@ func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
 		return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote transcription models: %v", err))
 	}

-	resp := tts.ImportModelsResponse{
+	resp := audiopkg.ImportModelsResponse{
 		Models: make([]string, 0, len(remoteModels)),
 	}

@@ -257,10 +294,10 @@ func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
 // @Description List all models of type 'speech' (filtered view of unified models table)
 // @Tags speech-models
 // @Produce json
-// @Success 200 {array} tts.SpeechModelResponse
+// @Success 200 {array} audiopkg.SpeechModelResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-models [get].
-func (h *SpeechHandler) ListModels(c echo.Context) error {
+func (h *AudioHandler) ListModels(c echo.Context) error {
 	items, err := h.service.ListSpeechModels(c.Request().Context())
 	if err != nil {
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -268,7 +305,15 @@ func (h *SpeechHandler) ListModels(c echo.Context) error {
 	return c.JSON(http.StatusOK, items)
 }

-func (h *SpeechHandler) ListTranscriptionModels(c echo.Context) error {
+// ListTranscriptionModels godoc
+// @Summary List all transcription models
+// @Description List all models of type 'transcription' (filtered view of unified models table)
+// @Tags transcription-models
+// @Produce json
+// @Success 200 {array} audiopkg.TranscriptionModelResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /transcription-models [get].
+func (h *AudioHandler) ListTranscriptionModels(c echo.Context) error {
 	items, err := h.service.ListTranscriptionModels(c.Request().Context())
 	if err != nil {
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
@@ -281,10 +326,11 @@ func (h *SpeechHandler) ListTranscriptionModels(c echo.Context) error {
 // @Tags speech-models
 // @Produce json
 // @Param id path string true "Model ID"
-// @Success 200 {object} tts.SpeechModelResponse
+// @Success 200 {object} audiopkg.SpeechModelResponse
+// @Failure 400 {object} ErrorResponse
 // @Failure 404 {object} ErrorResponse
 // @Router /speech-models/{id} [get].
-func (h *SpeechHandler) GetModel(c echo.Context) error {
+func (h *AudioHandler) GetModel(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -296,12 +341,23 @@ func (h *SpeechHandler) GetModel(c echo.Context) error {
 	return c.JSON(http.StatusOK, resp)
 }

-func (h *SpeechHandler) UpdateModel(c echo.Context) error {
+// UpdateModel godoc
+// @Summary Update a speech model
+// @Tags speech-models
+// @Accept json
+// @Produce json
+// @Param id path string true "Model ID"
+// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
+// @Success 200 {object} audiopkg.SpeechModelResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /speech-models/{id} [put].
+func (h *AudioHandler) UpdateModel(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
 	}
-	var req tts.UpdateSpeechModelRequest
+	var req audiopkg.UpdateSpeechModelRequest
 	if err := c.Bind(&req); err != nil {
 		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
 	}
@@ -312,7 +368,16 @@ func (h *SpeechHandler) UpdateModel(c echo.Context) error {
 	return c.JSON(http.StatusOK, resp)
 }
 
-func (h *SpeechHandler) GetTranscriptionModel(c echo.Context) error {
+// GetTranscriptionModel godoc
+// @Summary Get a transcription model
+// @Tags transcription-models
+// @Produce json
+// @Param id path string true "Model ID"
+// @Success 200 {object} audiopkg.TranscriptionModelResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 404 {object} ErrorResponse
+// @Router /transcription-models/{id} [get].
+func (h *AudioHandler) GetTranscriptionModel(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -324,12 +388,23 @@ func (h *SpeechHandler) GetTranscriptionModel(c echo.Context) error {
 	return c.JSON(http.StatusOK, resp)
 }

-func (h *SpeechHandler) UpdateTranscriptionModel(c echo.Context) error {
+// UpdateTranscriptionModel godoc
+// @Summary Update a transcription model
+// @Tags transcription-models
+// @Accept json
+// @Produce json
+// @Param id path string true "Model ID"
+// @Param request body audiopkg.UpdateSpeechModelRequest true "Model update payload"
+// @Success 200 {object} audiopkg.TranscriptionModelResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /transcription-models/{id} [put].
+func (h *AudioHandler) UpdateTranscriptionModel(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
 	}
-	var req tts.UpdateSpeechModelRequest
+	var req audiopkg.UpdateSpeechModelRequest
 	if err := c.Bind(&req); err != nil {
 		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
 	}
@@ -345,10 +420,11 @@ func (h *SpeechHandler) UpdateTranscriptionModel(c echo.Context) error {
 // @Tags speech-models
 // @Produce json
 // @Param id path string true "Model ID"
-// @Success 200 {object} tts.ModelCapabilities
+// @Success 200 {object} audiopkg.ModelCapabilities
+// @Failure 400 {object} ErrorResponse
 // @Failure 404 {object} ErrorResponse
 // @Router /speech-models/{id}/capabilities [get].
-func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
+func (h *AudioHandler) GetModelCapabilities(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -360,7 +435,16 @@ func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
 	return c.JSON(http.StatusOK, caps)
 }
 
-func (h *SpeechHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
+// GetTranscriptionModelCapabilities godoc
+// @Summary Get transcription model capabilities
+// @Tags transcription-models
+// @Produce json
+// @Param id path string true "Model ID"
+// @Success 200 {object} audiopkg.ModelCapabilities
+// @Failure 400 {object} ErrorResponse
+// @Failure 404 {object} ErrorResponse
+// @Router /transcription-models/{id}/capabilities [get].
+func (h *AudioHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -379,17 +462,17 @@ func (h *SpeechHandler) GetTranscriptionModelCapabilities(c echo.Context) error
 // @Accept json
 // @Produce application/octet-stream
 // @Param id path string true "Model ID"
-// @Param request body tts.TestSynthesizeRequest true "Text to synthesize"
+// @Param request body audiopkg.TestSynthesizeRequest true "Text to synthesize"
 // @Success 200 {file} binary "Audio data"
 // @Failure 400 {object} ErrorResponse
 // @Failure 500 {object} ErrorResponse
 // @Router /speech-models/{id}/test [post].
-func (h *SpeechHandler) TestModel(c echo.Context) error {
+func (h *AudioHandler) TestModel(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
 	}
-	var req tts.TestSynthesizeRequest
+	var req audiopkg.TestSynthesizeRequest
 	if err := c.Bind(&req); err != nil {
 		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
 	}
@@ -408,7 +491,20 @@ func (h *SpeechHandler) TestModel(c echo.Context) error {
 	return c.Blob(http.StatusOK, contentType, audio)
 }

-func (h *SpeechHandler) TestTranscriptionModel(c echo.Context) error {
+// TestTranscriptionModel godoc
+// @Summary Test transcription model recognition
+// @Description Transcribe uploaded audio using a specific model's config and return structured text output
+// @Tags transcription-models
+// @Accept mpfd
+// @Produce json
+// @Param id path string true "Model ID"
+// @Param file formData file true "Audio file"
+// @Param config formData string false "Optional JSON config"
+// @Success 200 {object} audiopkg.TestTranscriptionResponse
+// @Failure 400 {object} ErrorResponse
+// @Failure 500 {object} ErrorResponse
+// @Router /transcription-models/{id}/test [post].
+func (h *AudioHandler) TestTranscriptionModel(c echo.Context) error {
 	id := strings.TrimSpace(c.Param("id"))
 	if id == "" {
 		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
@@ -441,16 +537,16 @@ func (h *SpeechHandler) TestTranscriptionModel(c echo.Context) error {
 	if err != nil {
 		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
 	}
-	resp := tts.TestTranscriptionResponse{
+	resp := audiopkg.TestTranscriptionResponse{
 		Text:            result.Text,
 		Language:        result.Language,
 		DurationSeconds: result.DurationSeconds,
 		Metadata:        result.ProviderMetadata,
 	}
 	if len(result.Words) > 0 {
-		resp.Words = make([]tts.TranscriptionWord, 0, len(result.Words))
+		resp.Words = make([]audiopkg.TranscriptionWord, 0, len(result.Words))
 		for _, word := range result.Words {
-			resp.Words = append(resp.Words, tts.TranscriptionWord{
+			resp.Words = append(resp.Words, audiopkg.TranscriptionWord{
 				Text:      word.Text,
 				Start:     word.Start,
 				End:       word.End,